Make clang generic vector backend support 16, 32, and 64-byte vectors

libeigen/eigen!2213

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
Rasmus Munk Larsen
2026-02-25 08:50:47 -08:00
parent ea25ea52bb
commit 4fab38d798
9 changed files with 1249 additions and 346 deletions

View File

@@ -27,11 +27,21 @@ struct complex_packet_wrapper {
RealPacketT v;
};
using Packet8cf = complex_packet_wrapper<float, 8>;
using Packet4cf = complex_packet_wrapper<float, 4>;
// --- Primary complex packet aliases ---
constexpr int kComplexFloatSize = kFloatPacketSize / 2; // 2, 4, or 8
constexpr int kComplexDoubleSize = kDoublePacketSize / 2; // 1, 2, or 4
using PacketXcf = complex_packet_wrapper<float, kComplexFloatSize>;
using PacketXcd = complex_packet_wrapper<double, kComplexDoubleSize>;
// Sub-packet types needed for reductions at larger sizes.
// When PacketXcf/PacketXcd already has a given element count, the
// corresponding fixed-size alias is skipped to avoid duplicate definitions.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
using Packet2cf = complex_packet_wrapper<float, 2>;
using Packet4cd = complex_packet_wrapper<double, 4>;
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
using Packet4cf = complex_packet_wrapper<float, 4>;
using Packet2cd = complex_packet_wrapper<double, 2>;
#endif
struct generic_complex_packet_traits : default_packet_traits {
enum {
@@ -58,39 +68,39 @@ struct generic_complex_packet_traits : default_packet_traits {
template <>
struct packet_traits<std::complex<float>> : generic_complex_packet_traits {
using type = Packet8cf;
using half = Packet8cf;
using type = PacketXcf;
using half = PacketXcf;
enum {
size = 8,
size = kComplexFloatSize,
};
};
template <>
struct unpacket_traits<Packet8cf> : generic_unpacket_traits {
struct unpacket_traits<PacketXcf> : generic_unpacket_traits {
using type = std::complex<float>;
using half = Packet8cf;
using as_real = Packet16f;
using half = PacketXcf;
using as_real = PacketXf;
enum {
size = 8,
size = kComplexFloatSize,
};
};
template <>
struct packet_traits<std::complex<double>> : generic_complex_packet_traits {
using type = Packet4cd;
using half = Packet4cd;
using type = PacketXcd;
using half = PacketXcd;
enum {
size = 4,
size = kComplexDoubleSize,
};
};
template <>
struct unpacket_traits<Packet4cd> : generic_unpacket_traits {
struct unpacket_traits<PacketXcd> : generic_unpacket_traits {
using type = std::complex<double>;
using half = Packet4cd;
using as_real = Packet8d;
using half = PacketXcd;
using as_real = PacketXd;
enum {
size = 4,
size = kComplexDoubleSize,
};
};
@@ -115,24 +125,58 @@ struct unpacket_traits<Packet4cd> : generic_unpacket_traits {
pstore(&numext::real_ref(*to), from.v); \
}
EIGEN_CLANG_COMPLEX_LOAD_STORE(Packet8cf);
EIGEN_CLANG_COMPLEX_LOAD_STORE(Packet4cd);
EIGEN_CLANG_COMPLEX_LOAD_STORE(PacketXcf);
EIGEN_CLANG_COMPLEX_LOAD_STORE(PacketXcd);
#undef EIGEN_CLANG_COMPLEX_LOAD_STORE
template <>
EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from) {
const float re = numext::real(from);
const float im = numext::imag(from);
return Packet8cf(Packet16f{re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im});
}
// --- pset1 for complex ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from) {
EIGEN_STRONG_INLINE PacketXcf pset1<PacketXcf>(const std::complex<float>& from) {
const float re = numext::real(from);
const float im = numext::imag(from);
return PacketXcf(PacketXf{re, im, re, im});
}
template <>
EIGEN_STRONG_INLINE PacketXcd pset1<PacketXcd>(const std::complex<double>& from) {
const double re = numext::real(from);
const double im = numext::imag(from);
return Packet4cd(Packet8d{re, im, re, im, re, im, re, im});
return PacketXcd(PacketXd{re, im});
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf pset1<PacketXcf>(const std::complex<float>& from) {
const float re = numext::real(from);
const float im = numext::imag(from);
return PacketXcf(PacketXf{re, im, re, im, re, im, re, im});
}
template <>
EIGEN_STRONG_INLINE PacketXcd pset1<PacketXcd>(const std::complex<double>& from) {
const double re = numext::real(from);
const double im = numext::imag(from);
return PacketXcd(PacketXd{re, im, re, im});
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf pset1<PacketXcf>(const std::complex<float>& from) {
const float re = numext::real(from);
const float im = numext::imag(from);
return PacketXcf(PacketXf{re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im});
}
template <>
EIGEN_STRONG_INLINE PacketXcd pset1<PacketXcd>(const std::complex<double>& from) {
const double re = numext::real(from);
const double im = numext::imag(from);
return PacketXcd(PacketXd{re, im, re, im, re, im, re, im});
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// ----------- Unary ops ------------------
#define DELEGATE_UNARY_TO_REAL_OP(PACKET_TYPE, OP) \
template <> \
@@ -149,134 +193,348 @@ EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from)
} \
EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(PACKET_TYPE)
EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(Packet8cf);
EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(Packet4cd);
EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(PacketXcf);
EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(PacketXcd);
// --- pconj ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet8cf pconj<Packet8cf>(const Packet8cf& a) {
return Packet8cf(__builtin_shufflevector(a.v, -a.v, 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31));
EIGEN_STRONG_INLINE PacketXcf pconj<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pconj<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, -a.v, 0, 3));
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf pconj<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pconj<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf pconj<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, -a.v, 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pconj<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15));
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// Sub-packet pconj specializations needed for reductions.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
template <>
EIGEN_STRONG_INLINE Packet2cf pconj<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
}
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
template <>
EIGEN_STRONG_INLINE Packet4cf pconj<Packet4cf>(const Packet4cf& a) {
return Packet4cf(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pconj<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
}
template <>
EIGEN_STRONG_INLINE Packet4cd pconj<Packet4cd>(const Packet4cd& a) {
return Packet4cd(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pconj<Packet2cd>(const Packet2cd& a) {
return Packet2cd(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
}
#endif
#undef DELEGATE_UNARY_TO_REAL_OP
#undef EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS
// Flip real and imaginary parts, i.e. {re(a), im(a)} -> {im(a), re(a)}.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& a) {
return Packet8cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
EIGEN_STRONG_INLINE PacketXcf pcplxflip<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pcplxflip<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 0));
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf pcplxflip<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pcplxflip<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf pcplxflip<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pcplxflip<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6));
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// Sub-packet pcplxflip specializations needed for reductions.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
template <>
EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
}
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
template <>
EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& a) {
return Packet4cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
}
template <>
EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& a) {
return Packet4cd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& a) {
return Packet2cd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
}
#endif
// Copy real to imaginary part, i.e. {re(a), im(a)} -> {re(a), re(a)}.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet8cf pdupreal<Packet8cf>(const Packet8cf& a) {
return Packet8cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14));
EIGEN_STRONG_INLINE PacketXcf pdupreal<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pdupreal<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 0, 0));
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf pdupreal<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pdupreal<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf pdupreal<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pdupreal<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6));
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// Sub-packet pdupreal specializations needed for reductions.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
template <>
EIGEN_STRONG_INLINE Packet2cf pdupreal<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
}
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
template <>
EIGEN_STRONG_INLINE Packet4cf pdupreal<Packet4cf>(const Packet4cf& a) {
return Packet4cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pdupreal<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
}
template <>
EIGEN_STRONG_INLINE Packet4cd pdupreal<Packet4cd>(const Packet4cd& a) {
return Packet4cd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pdupreal<Packet2cd>(const Packet2cd& a) {
return Packet2cd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
}
#endif
// Copy imaginary to real part, i.e. {re(a), im(a)} -> {im(a), im(a)}.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet8cf pdupimag<Packet8cf>(const Packet8cf& a) {
return Packet8cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15));
EIGEN_STRONG_INLINE PacketXcf pdupimag<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pdupimag<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 1));
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf pdupimag<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pdupimag<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf pdupimag<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pdupimag<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7));
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// Sub-packet pdupimag specializations needed for reductions.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
template <>
EIGEN_STRONG_INLINE Packet2cf pdupimag<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
}
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
template <>
EIGEN_STRONG_INLINE Packet4cf pdupimag<Packet4cf>(const Packet4cf& a) {
return Packet4cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pdupimag<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
}
template <>
EIGEN_STRONG_INLINE Packet4cd pdupimag<Packet4cd>(const Packet4cd& a) {
return Packet4cd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pdupimag<Packet2cd>(const Packet2cd& a) {
return Packet2cd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
}
#endif
// --- ploaddup ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from) {
return Packet8cf(Packet16f{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]),
std::real(from[2]), std::imag(from[2]), std::real(from[2]), std::imag(from[2]),
std::real(from[3]), std::imag(from[3]), std::real(from[3]), std::imag(from[3])});
EIGEN_STRONG_INLINE PacketXcf ploaddup<PacketXcf>(const std::complex<float>* from) {
return pset1<PacketXcf>(*from);
}
template <>
EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
return Packet4cd(Packet8d{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
EIGEN_STRONG_INLINE PacketXcd ploaddup<PacketXcd>(const std::complex<double>* from) {
return pset1<PacketXcd>(*from);
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf ploaddup<PacketXcf>(const std::complex<float>* from) {
return PacketXcf(PacketXf{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])});
}
template <>
EIGEN_STRONG_INLINE PacketXcd ploaddup<PacketXcd>(const std::complex<double>* from) {
return pset1<PacketXcd>(*from);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf ploaddup<PacketXcf>(const std::complex<float>* from) {
return PacketXcf(PacketXf{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]),
std::real(from[2]), std::imag(from[2]), std::real(from[2]), std::imag(from[2]),
std::real(from[3]), std::imag(from[3]), std::real(from[3]), std::imag(from[3])});
}
template <>
EIGEN_STRONG_INLINE PacketXcd ploaddup<PacketXcd>(const std::complex<double>* from) {
return PacketXcd(PacketXd{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])});
}
template <>
EIGEN_STRONG_INLINE Packet8cf ploadquad<Packet8cf>(const std::complex<float>* from) {
return Packet8cf(Packet16f{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])});
}
template <>
EIGEN_STRONG_INLINE Packet4cd ploadquad<Packet4cd>(const std::complex<double>* from) {
return pset1<Packet4cd>(*from);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- ploadquad ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet8cf preverse<Packet8cf>(const Packet8cf& a) {
return Packet8cf(reinterpret_cast<Packet16f>(preverse(reinterpret_cast<Packet8d>(a.v))));
EIGEN_STRONG_INLINE PacketXcf ploadquad<PacketXcf>(const std::complex<float>* from) {
return pset1<PacketXcf>(*from);
}
template <>
EIGEN_STRONG_INLINE Packet4cd preverse<Packet4cd>(const Packet4cd& a) {
return Packet4cd(__builtin_shufflevector(a.v, a.v, 6, 7, 4, 5, 2, 3, 0, 1));
EIGEN_STRONG_INLINE PacketXcd ploadquad<PacketXcd>(const std::complex<double>* from) {
return pset1<PacketXcd>(*from);
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf ploadquad<PacketXcf>(const std::complex<float>* from) {
return pset1<PacketXcf>(*from);
}
template <>
EIGEN_STRONG_INLINE PacketXcd ploadquad<PacketXcd>(const std::complex<double>* from) {
return pset1<PacketXcd>(*from);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf ploadquad<PacketXcf>(const std::complex<float>* from) {
return PacketXcf(PacketXf{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])});
}
template <>
EIGEN_STRONG_INLINE PacketXcd ploadquad<PacketXcd>(const std::complex<double>* from) {
return pset1<PacketXcd>(*from);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- preverse ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE PacketXcf preverse<PacketXcf>(const PacketXcf& a) {
// 2 complex floats: swap the two (re,im) pairs, i.e. float lanes (0,1) <-> (2,3)
return PacketXcf(__builtin_shufflevector(a.v, a.v, 2, 3, 0, 1));
}
template <>
EIGEN_STRONG_INLINE PacketXcd preverse<PacketXcd>(const PacketXcd& a) {
// 1 complex double: identity
return a;
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf preverse<PacketXcf>(const PacketXcf& a) {
// 4 complex floats: reverse pairs
return PacketXcf(reinterpret_cast<PacketXf>(preverse(reinterpret_cast<PacketXd>(a.v))));
}
template <>
EIGEN_STRONG_INLINE PacketXcd preverse<PacketXcd>(const PacketXcd& a) {
// 2 complex doubles: swap pairs
return PacketXcd(__builtin_shufflevector(a.v, a.v, 2, 3, 0, 1));
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf preverse<PacketXcf>(const PacketXcf& a) {
return PacketXcf(reinterpret_cast<PacketXf>(preverse(reinterpret_cast<PacketXd>(a.v))));
}
template <>
EIGEN_STRONG_INLINE PacketXcd preverse<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 6, 7, 4, 5, 2, 3, 0, 1));
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// ----------- Binary ops ------------------
#define DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, OP) \
template <> \
@@ -300,8 +558,8 @@ EIGEN_STRONG_INLINE Packet4cd preverse<Packet4cd>(const Packet4cd& a) {
return PACKET_TYPE(pand(pdupreal(t).v, pdupimag(t).v)); \
}
EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(Packet8cf);
EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(Packet4cd);
EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(PacketXcf);
EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(PacketXcd);
// Binary ops that are needed on sub-packets for predux and predux_mul.
#define EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(PACKET_TYPE) \
@@ -311,11 +569,17 @@ EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(Packet4cd);
return pmul_complex(a, b); \
}
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet8cf);
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet4cf);
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(PacketXcf);
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cf);
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet4cd);
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet4cf);
#endif
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(PacketXcd);
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cd);
#endif
#define EIGEN_CLANG_PACKET_SCATTER_GATHER(PACKET_TYPE) \
template <> \
@@ -338,8 +602,8 @@ EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cd);
return result; \
}
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8cf);
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet4cd);
EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXcf);
EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXcd);
#undef EIGEN_CLANG_PACKET_SCATTER_GATHER
#undef DELEGATE_BINARY_TO_REAL_OP
@@ -348,46 +612,89 @@ EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet4cd);
// ------------ ternary ops -------------
template <>
EIGEN_STRONG_INLINE Packet8cf pselect<Packet8cf>(const Packet8cf& mask, const Packet8cf& a, const Packet8cf& b) {
return Packet8cf(reinterpret_cast<Packet16f>(
pselect(reinterpret_cast<Packet8d>(mask.v), reinterpret_cast<Packet8d>(a.v), reinterpret_cast<Packet8d>(b.v))));
EIGEN_STRONG_INLINE PacketXcf pselect<PacketXcf>(const PacketXcf& mask, const PacketXcf& a, const PacketXcf& b) {
return PacketXcf(reinterpret_cast<PacketXf>(
pselect(reinterpret_cast<PacketXd>(mask.v), reinterpret_cast<PacketXd>(a.v), reinterpret_cast<PacketXd>(b.v))));
}
// --- zip_in_place for complex ---
namespace detail {
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8cf>(Packet8cf& p1, Packet8cf& p2) {
Packet16f tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23);
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcf>(PacketXcf& p1, PacketXcf& p2) {
PacketXf tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 4, 5);
p2.v = __builtin_shufflevector(p1.v, p2.v, 2, 3, 6, 7);
p1.v = tmp;
}
// PacketXcd at 16 bytes has 1 element, no zip_in_place needed.
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcf>(PacketXcf& p1, PacketXcf& p2) {
PacketXf tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 8, 9, 2, 3, 10, 11);
p2.v = __builtin_shufflevector(p1.v, p2.v, 4, 5, 12, 13, 6, 7, 14, 15);
p1.v = tmp;
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcd>(PacketXcd& p1, PacketXcd& p2) {
PacketXd tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 4, 5);
p2.v = __builtin_shufflevector(p1.v, p2.v, 2, 3, 6, 7);
p1.v = tmp;
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcf>(PacketXcf& p1, PacketXcf& p2) {
PacketXf tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23);
p2.v = __builtin_shufflevector(p1.v, p2.v, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31);
p1.v = tmp;
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet4cd>(Packet4cd& p1, Packet4cd& p2) {
Packet8d tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 8, 9, 2, 3, 10, 11);
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcd>(PacketXcd& p1, PacketXcd& p2) {
PacketXd tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 8, 9, 2, 3, 10, 11);
p2.v = __builtin_shufflevector(p1.v, p2.v, 4, 5, 12, 13, 6, 7, 14, 15);
p1.v = tmp;
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
} // namespace detail
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8cf, 8>& kernel) {
// --- ptranspose for complex ---
// PacketXcf: valid block sizes depend on kComplexFloatSize.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcf, 2>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8cf, 4>& kernel) {
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcf, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8cf, 2>& kernel) {
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcf, 8>& kernel) {
detail::ptranspose_impl(kernel);
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4cd, 4>& kernel) {
// PacketXcd: valid block sizes depend on kComplexDoubleSize.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcd, 2>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4cd, 2>& kernel) {
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcd, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
#endif
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf, Packet16f)
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd, Packet8d)
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PacketXcf, PacketXf)
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PacketXcd, PacketXd)
} // end namespace internal
} // end namespace Eigen

View File

@@ -18,27 +18,27 @@ namespace Eigen {
namespace internal {
template <>
EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent) {
EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent) {
return pfrexp_generic(a, exponent);
}
template <>
EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(const Packet8d& a, Packet8d& exponent) {
EIGEN_STRONG_INLINE PacketXd pfrexp<PacketXd>(const PacketXd& a, PacketXd& exponent) {
return pfrexp_generic(a, exponent);
}
template <>
EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent) {
return pldexp_generic(a, exponent);
}
template <>
EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
EIGEN_STRONG_INLINE PacketXd pldexp<PacketXd>(const PacketXd& a, const PacketXd& exponent) {
return pldexp_generic(a, exponent);
}
EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet16f)
EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet8d)
EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketXf)
EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketXd)
} // end namespace internal

View File

@@ -24,14 +24,32 @@ template <typename ScalarT, int n>
using VectorType = ScalarT __attribute__((ext_vector_type(n), aligned(n * sizeof(ScalarT))));
} // namespace detail
// --- Primary packet type definitions (fixed at 64 bytes) ---
// --- Naming Convention ---
// This backend uses size-independent type aliases so the same code works
// for EIGEN_GENERIC_VECTOR_SIZE_BYTES in {16, 32, 64}:
//
// PacketXf - float vector (4, 8, or 16 elements)
// PacketXd - double vector (2, 4, or 8 elements)
// PacketXi - int32_t vector (4, 8, or 16 elements)
// PacketXl - int64_t vector (2, 4, or 8 elements)
// PacketXcf - complex<float> vector (2, 4, or 8 elements) [in Complex.h]
// PacketXcd - complex<double> vector (1, 2, or 4 elements) [in Complex.h]
//
// The "X" suffix indicates the element count is determined by the macro
// EIGEN_GENERIC_VECTOR_SIZE_BYTES at compile time. Operations that require
// compile-time constant indices (e.g. __builtin_shufflevector) use
// #if EIGEN_GENERIC_VECTOR_SIZE_BYTES == ... blocks.
// TODO(rmlarsen): Generalize to other vector sizes.
static_assert(EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64, "We currently assume the full vector size is 64 bytes");
using Packet16f = detail::VectorType<float, 16>;
using Packet8d = detail::VectorType<double, 8>;
using Packet16i = detail::VectorType<int32_t, 16>;
using Packet8l = detail::VectorType<int64_t, 8>;
static_assert(EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 || EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 ||
EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64,
"EIGEN_GENERIC_VECTOR_SIZE_BYTES must be 16, 32, or 64");
constexpr int kFloatPacketSize = EIGEN_GENERIC_VECTOR_SIZE_BYTES / sizeof(float);
constexpr int kDoublePacketSize = EIGEN_GENERIC_VECTOR_SIZE_BYTES / sizeof(double);
using PacketXf = detail::VectorType<float, kFloatPacketSize>;
using PacketXd = detail::VectorType<double, kDoublePacketSize>;
using PacketXi = detail::VectorType<int32_t, kFloatPacketSize>;
using PacketXl = detail::VectorType<int64_t, kDoublePacketSize>;
// --- packet_traits specializations ---
struct generic_float_packet_traits : default_packet_traits {
@@ -82,20 +100,20 @@ struct generic_float_packet_traits : default_packet_traits {
template <>
struct packet_traits<float> : generic_float_packet_traits {
using type = Packet16f;
using half = Packet16f;
using type = PacketXf;
using half = PacketXf;
enum {
size = 16,
size = kFloatPacketSize,
};
};
template <>
struct packet_traits<double> : generic_float_packet_traits {
using type = Packet8d;
using half = Packet8d;
using type = PacketXd;
using half = PacketXd;
// Generic double-precision acos/asin are not yet implemented in
// GenericPacketMathFunctions.h (only float versions exist).
enum { size = 8, HasACos = 0, HasASin = 0 };
enum { size = kDoublePacketSize, HasACos = 0, HasASin = 0 };
};
struct generic_integer_packet_traits : default_packet_traits {
@@ -131,19 +149,19 @@ struct generic_integer_packet_traits : default_packet_traits {
template <>
struct packet_traits<int32_t> : generic_integer_packet_traits {
using type = Packet16i;
using half = Packet16i;
using type = PacketXi;
using half = PacketXi;
enum {
size = 16,
size = kFloatPacketSize,
};
};
template <>
struct packet_traits<int64_t> : generic_integer_packet_traits {
using type = Packet8l;
using half = Packet8l;
using type = PacketXl;
using half = PacketXl;
enum {
size = 8,
size = kDoublePacketSize,
};
};
@@ -156,37 +174,37 @@ struct generic_unpacket_traits : default_unpacket_traits {
};
template <>
struct unpacket_traits<Packet16f> : generic_unpacket_traits {
struct unpacket_traits<PacketXf> : generic_unpacket_traits {
using type = float;
using half = Packet16f;
using integer_packet = Packet16i;
using half = PacketXf;
using integer_packet = PacketXi;
enum {
size = 16,
size = kFloatPacketSize,
};
};
template <>
struct unpacket_traits<Packet8d> : generic_unpacket_traits {
struct unpacket_traits<PacketXd> : generic_unpacket_traits {
using type = double;
using half = Packet8d;
using integer_packet = Packet8l;
using half = PacketXd;
using integer_packet = PacketXl;
enum {
size = 8,
size = kDoublePacketSize,
};
};
template <>
struct unpacket_traits<Packet16i> : generic_unpacket_traits {
struct unpacket_traits<PacketXi> : generic_unpacket_traits {
using type = int32_t;
using half = Packet16i;
using half = PacketXi;
enum {
size = 16,
size = kFloatPacketSize,
};
};
template <>
struct unpacket_traits<Packet8l> : generic_unpacket_traits {
struct unpacket_traits<PacketXl> : generic_unpacket_traits {
using type = int64_t;
using half = Packet8l;
using half = PacketXl;
enum {
size = 8,
size = kDoublePacketSize,
};
};
@@ -265,21 +283,21 @@ EIGEN_STRONG_INLINE void store_vector_aligned(scalar_type_of_vector_t<VectorT>*
detail::store_vector_aligned<PACKET_TYPE>(to, from); \
}
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16f)
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8d)
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16i)
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8l)
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXf)
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXd)
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXi)
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXl)
#undef EIGEN_CLANG_PACKET_LOAD_STORE_PACKET
// --- Broadcast operation ---
template <>
EIGEN_STRONG_INLINE Packet16f pset1frombits<Packet16f>(uint32_t from) {
return Packet16f(numext::bit_cast<float>(from));
EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(uint32_t from) {
return PacketXf(numext::bit_cast<float>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8d pset1frombits<Packet8d>(uint64_t from) {
return Packet8d(numext::bit_cast<double>(from));
EIGEN_STRONG_INLINE PacketXd pset1frombits<PacketXd>(uint64_t from) {
return PacketXd(numext::bit_cast<double>(from));
}
#define EIGEN_CLANG_PACKET_SET1(PACKET_TYPE) \
@@ -292,10 +310,10 @@ EIGEN_STRONG_INLINE Packet8d pset1frombits<Packet8d>(uint64_t from) {
return from[0]; \
}
EIGEN_CLANG_PACKET_SET1(Packet16f)
EIGEN_CLANG_PACKET_SET1(Packet8d)
EIGEN_CLANG_PACKET_SET1(Packet16i)
EIGEN_CLANG_PACKET_SET1(Packet8l)
EIGEN_CLANG_PACKET_SET1(PacketXf)
EIGEN_CLANG_PACKET_SET1(PacketXd)
EIGEN_CLANG_PACKET_SET1(PacketXi)
EIGEN_CLANG_PACKET_SET1(PacketXl)
#undef EIGEN_CLANG_PACKET_SET1
// --- Arithmetic operations ---
@@ -309,10 +327,10 @@ EIGEN_CLANG_PACKET_SET1(Packet8l)
return -a; \
}
EIGEN_CLANG_PACKET_ARITHMETIC(Packet16f)
EIGEN_CLANG_PACKET_ARITHMETIC(Packet8d)
EIGEN_CLANG_PACKET_ARITHMETIC(Packet16i)
EIGEN_CLANG_PACKET_ARITHMETIC(Packet8l)
EIGEN_CLANG_PACKET_ARITHMETIC(PacketXf)
EIGEN_CLANG_PACKET_ARITHMETIC(PacketXd)
EIGEN_CLANG_PACKET_ARITHMETIC(PacketXi)
EIGEN_CLANG_PACKET_ARITHMETIC(PacketXl)
#undef EIGEN_CLANG_PACKET_ARITHMETIC
// --- Bitwise operations (via casting) ---
@@ -321,10 +339,10 @@ namespace detail {
// Reinterpret-cast helpers, equivalent to preinterpret<> but defined here
// because PacketMath.h is included before TypeCasting.h.
EIGEN_STRONG_INLINE Packet16i preinterpret_float_to_int(const Packet16f& a) { return reinterpret_cast<Packet16i>(a); }
EIGEN_STRONG_INLINE Packet16f preinterpret_int_to_float(const Packet16i& a) { return reinterpret_cast<Packet16f>(a); }
EIGEN_STRONG_INLINE Packet8l preinterpret_double_to_long(const Packet8d& a) { return reinterpret_cast<Packet8l>(a); }
EIGEN_STRONG_INLINE Packet8d preinterpret_long_to_double(const Packet8l& a) { return reinterpret_cast<Packet8d>(a); }
EIGEN_STRONG_INLINE PacketXi preinterpret_float_to_int(const PacketXf& a) { return reinterpret_cast<PacketXi>(a); }
EIGEN_STRONG_INLINE PacketXf preinterpret_int_to_float(const PacketXi& a) { return reinterpret_cast<PacketXf>(a); }
EIGEN_STRONG_INLINE PacketXl preinterpret_double_to_long(const PacketXd& a) { return reinterpret_cast<PacketXl>(a); }
EIGEN_STRONG_INLINE PacketXd preinterpret_long_to_double(const PacketXl& a) { return reinterpret_cast<PacketXd>(a); }
} // namespace detail
@@ -368,8 +386,8 @@ EIGEN_STRONG_INLINE Packet8d preinterpret_long_to_double(const Packet8l& a) { re
return a << N; \
}
EIGEN_CLANG_PACKET_BITWISE_INT(Packet16i)
EIGEN_CLANG_PACKET_BITWISE_INT(Packet8l)
EIGEN_CLANG_PACKET_BITWISE_INT(PacketXi)
EIGEN_CLANG_PACKET_BITWISE_INT(PacketXl)
#undef EIGEN_CLANG_PACKET_BITWISE_INT
// Bitwise ops for floating point packets
@@ -401,8 +419,8 @@ EIGEN_CLANG_PACKET_BITWISE_INT(Packet8l)
return CAST_FROM_INT(CAST_TO_INT(a) & ~CAST_TO_INT(b)); \
}
EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet16f, detail::preinterpret_float_to_int, detail::preinterpret_int_to_float)
EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet8d, detail::preinterpret_double_to_long, detail::preinterpret_long_to_double)
EIGEN_CLANG_PACKET_BITWISE_FLOAT(PacketXf, detail::preinterpret_float_to_int, detail::preinterpret_int_to_float)
EIGEN_CLANG_PACKET_BITWISE_FLOAT(PacketXd, detail::preinterpret_double_to_long, detail::preinterpret_long_to_double)
#undef EIGEN_CLANG_PACKET_BITWISE_FLOAT
// --- Comparison operations ---
@@ -428,8 +446,8 @@ EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet8d, detail::preinterpret_double_to_long,
return numext::bit_cast<PACKET_TYPE>(INT_PACKET_TYPE(!(a >= b))); \
}
EIGEN_CLANG_PACKET_CMP(Packet16f, Packet16i)
EIGEN_CLANG_PACKET_CMP(Packet8d, Packet8l)
EIGEN_CLANG_PACKET_CMP(PacketXf, PacketXi)
EIGEN_CLANG_PACKET_CMP(PacketXd, PacketXl)
#undef EIGEN_CLANG_PACKET_CMP
// --- Min/Max operations ---
@@ -472,10 +490,10 @@ EIGEN_CLANG_PACKET_CMP(Packet8d, Packet8l)
return mask != 0 ? a : b; \
}
EIGEN_CLANG_PACKET_ELEMENTWISE(Packet16f)
EIGEN_CLANG_PACKET_ELEMENTWISE(Packet8d)
EIGEN_CLANG_PACKET_ELEMENTWISE(Packet16i)
EIGEN_CLANG_PACKET_ELEMENTWISE(Packet8l)
EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXf)
EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXd)
EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXi)
EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXl)
#undef EIGEN_CLANG_PACKET_ELEMENTWISE
#endif
@@ -510,8 +528,8 @@ EIGEN_CLANG_PACKET_ELEMENTWISE(Packet8l)
return __builtin_elementwise_sqrt(a); \
}
EIGEN_CLANG_PACKET_MATH_FLOAT(Packet16f)
EIGEN_CLANG_PACKET_MATH_FLOAT(Packet8d)
EIGEN_CLANG_PACKET_MATH_FLOAT(PacketXf)
EIGEN_CLANG_PACKET_MATH_FLOAT(PacketXd)
#undef EIGEN_CLANG_PACKET_MATH_FLOAT
#endif
@@ -563,8 +581,8 @@ EIGEN_CLANG_PACKET_MATH_FLOAT(Packet8d)
}
#endif
EIGEN_CLANG_PACKET_MADD(Packet16f)
EIGEN_CLANG_PACKET_MADD(Packet8d)
EIGEN_CLANG_PACKET_MADD(PacketXf)
EIGEN_CLANG_PACKET_MADD(PacketXd)
#undef EIGEN_CLANG_PACKET_MADD
#define EIGEN_CLANG_PACKET_SCATTER_GATHER(PACKET_TYPE) \
@@ -586,10 +604,10 @@ EIGEN_CLANG_PACKET_MADD(Packet8d)
return result; \
}
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet16f)
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8d)
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet16i)
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8l)
EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXf)
EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXd)
EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXi)
EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXl)
#undef EIGEN_CLANG_PACKET_SCATTER_GATHER
@@ -597,6 +615,14 @@ EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8l)
#if EIGEN_HAS_BUILTIN(__builtin_shufflevector)
namespace detail {
// Size-specific lane reversal via a compile-time shuffle.
// preverse_impl_N reverses an N-lane vector: [a0, ..., aN-1] -> [aN-1, ..., a0].
template <typename Packet>
EIGEN_STRONG_INLINE Packet preverse_impl_2(const Packet& a) {
return __builtin_shufflevector(a, a, 1, 0);
}
// 4-lane reversal.
template <typename Packet>
EIGEN_STRONG_INLINE Packet preverse_impl_4(const Packet& a) {
return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}
// 8-lane reversal.
template <typename Packet>
EIGEN_STRONG_INLINE Packet preverse_impl_8(const Packet& a) {
return __builtin_shufflevector(a, a, 7, 6, 5, 4, 3, 2, 1, 0);
}
@@ -606,33 +632,81 @@ EIGEN_STRONG_INLINE Packet preverse_impl_16(const Packet& a) {
}
} // namespace detail
#define EIGEN_CLANG_PACKET_REVERSE(PACKET_TYPE, SIZE) \
template <> \
EIGEN_STRONG_INLINE PACKET_TYPE preverse<PACKET_TYPE>(const PACKET_TYPE& a) { \
return detail::preverse_impl_##SIZE(a); \
}
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
EIGEN_CLANG_PACKET_REVERSE(Packet16f, 16)
EIGEN_CLANG_PACKET_REVERSE(Packet8d, 8)
EIGEN_CLANG_PACKET_REVERSE(Packet16i, 16)
EIGEN_CLANG_PACKET_REVERSE(Packet8l, 8)
#undef EIGEN_CLANG_PACKET_REVERSE
template <>
EIGEN_STRONG_INLINE PacketXf preverse<PacketXf>(const PacketXf& a) {
return detail::preverse_impl_4(a);
}
template <>
EIGEN_STRONG_INLINE PacketXd preverse<PacketXd>(const PacketXd& a) {
return detail::preverse_impl_2(a);
}
template <>
EIGEN_STRONG_INLINE PacketXi preverse<PacketXi>(const PacketXi& a) {
return detail::preverse_impl_4(a);
}
template <>
EIGEN_STRONG_INLINE PacketXl preverse<PacketXl>(const PacketXl& a) {
return detail::preverse_impl_2(a);
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// 32-byte vectors: PacketXf/PacketXi carry 8 lanes, PacketXd/PacketXl carry 4.
template <>
EIGEN_STRONG_INLINE PacketXf preverse<PacketXf>(const PacketXf& a) {
return detail::preverse_impl_8(a);
}
template <>
EIGEN_STRONG_INLINE PacketXd preverse<PacketXd>(const PacketXd& a) {
return detail::preverse_impl_4(a);
}
template <>
EIGEN_STRONG_INLINE PacketXi preverse<PacketXi>(const PacketXi& a) {
return detail::preverse_impl_8(a);
}
template <>
EIGEN_STRONG_INLINE PacketXl preverse<PacketXl>(const PacketXl& a) {
return detail::preverse_impl_4(a);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// 64-byte vectors: PacketXf/PacketXi carry 16 lanes, PacketXd/PacketXl carry 8.
template <>
EIGEN_STRONG_INLINE PacketXf preverse<PacketXf>(const PacketXf& a) {
return detail::preverse_impl_16(a);
}
template <>
EIGEN_STRONG_INLINE PacketXd preverse<PacketXd>(const PacketXd& a) {
return detail::preverse_impl_8(a);
}
template <>
EIGEN_STRONG_INLINE PacketXi preverse<PacketXi>(const PacketXi& a) {
return detail::preverse_impl_16(a);
}
template <>
EIGEN_STRONG_INLINE PacketXl preverse<PacketXl>(const PacketXl& a) {
return detail::preverse_impl_8(a);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
namespace detail {
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploaddup16(const typename unpacket_traits<Packet>::type* from) {
EIGEN_STRONG_INLINE Packet ploaddup2(const typename unpacket_traits<Packet>::type* from) {
static_assert((unpacket_traits<Packet>::size) % 2 == 0, "Packet size must be a multiple of 2");
using HalfPacket = HalfPacket<Packet>;
HalfPacket a = load_vector_unaligned<HalfPacket>(from);
return __builtin_shufflevector(a, a, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
return __builtin_shufflevector(a, a, 0, 0);
}
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploadquad16(const typename unpacket_traits<Packet>::type* from) {
static_assert((unpacket_traits<Packet>::size) % 4 == 0, "Packet size must be a multiple of 4");
using QuarterPacket = QuarterPacket<Packet>;
QuarterPacket a = load_vector_unaligned<QuarterPacket>(from);
return __builtin_shufflevector(a, a, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3);
EIGEN_STRONG_INLINE Packet ploaddup4(const typename unpacket_traits<Packet>::type* from) {
static_assert((unpacket_traits<Packet>::size) % 2 == 0, "Packet size must be a multiple of 2");
using HalfPacket = HalfPacket<Packet>;
HalfPacket a = load_vector_unaligned<HalfPacket>(from);
return __builtin_shufflevector(a, a, 0, 0, 1, 1);
}
template <typename Packet>
@@ -643,6 +717,22 @@ EIGEN_STRONG_INLINE Packet ploaddup8(const typename unpacket_traits<Packet>::typ
return __builtin_shufflevector(a, a, 0, 0, 1, 1, 2, 2, 3, 3);
}
// Loads 8 scalars and duplicates each into adjacent lanes of a 16-lane packet:
// from[0..7] -> {f0, f0, f1, f1, ..., f7, f7}.
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploaddup16(const typename unpacket_traits<Packet>::type* from) {
static_assert((unpacket_traits<Packet>::size) % 2 == 0, "Packet size must be a multiple of 2");
using HalfPacket = HalfPacket<Packet>;
HalfPacket a = load_vector_unaligned<HalfPacket>(from);
return __builtin_shufflevector(a, a, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
}
// Loads one scalar and broadcasts it to all 4 lanes of the packet:
// from[0] -> {f0, f0, f0, f0}.
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploadquad4(const typename unpacket_traits<Packet>::type* from) {
static_assert((unpacket_traits<Packet>::size) % 4 == 0, "Packet size must be a multiple of 4");
using QuarterPacket = QuarterPacket<Packet>;
QuarterPacket a = load_vector_unaligned<QuarterPacket>(from);
return __builtin_shufflevector(a, a, 0, 0, 0, 0);
}
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploadquad8(const typename unpacket_traits<Packet>::type* from) {
static_assert((unpacket_traits<Packet>::size) % 4 == 0, "Packet size must be a multiple of 4");
@@ -651,84 +741,241 @@ EIGEN_STRONG_INLINE Packet ploadquad8(const typename unpacket_traits<Packet>::ty
return __builtin_shufflevector(a, a, 0, 0, 0, 0, 1, 1, 1, 1);
}
// Loads 4 scalars and replicates each into 4 adjacent lanes of a 16-lane packet:
// from[0..3] -> {f0 x4, f1 x4, f2 x4, f3 x4}.
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploadquad16(const typename unpacket_traits<Packet>::type* from) {
static_assert((unpacket_traits<Packet>::size) % 4 == 0, "Packet size must be a multiple of 4");
using QuarterPacket = QuarterPacket<Packet>;
QuarterPacket a = load_vector_unaligned<QuarterPacket>(from);
return __builtin_shufflevector(a, a, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3);
}
} // namespace detail
template <>
EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
return detail::ploaddup16<Packet16f>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
return detail::ploaddup8<Packet8d>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16i ploaddup<Packet16i>(const int32_t* from) {
return detail::ploaddup16<Packet16i>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8l ploaddup<Packet8l>(const int64_t* from) {
return detail::ploaddup8<Packet8l>(from);
}
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
return detail::ploadquad16<Packet16f>(from);
EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
return detail::ploaddup4<PacketXf>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
return detail::ploadquad8<Packet8d>(from);
EIGEN_STRONG_INLINE PacketXd ploaddup<PacketXd>(const double* from) {
return detail::ploaddup2<PacketXd>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16i ploadquad<Packet16i>(const int32_t* from) {
return detail::ploadquad16<Packet16i>(from);
EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const int32_t* from) {
return detail::ploaddup4<PacketXi>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8l ploadquad<Packet8l>(const int64_t* from) {
return detail::ploadquad8<Packet8l>(from);
EIGEN_STRONG_INLINE PacketXl ploaddup<PacketXl>(const int64_t* from) {
return detail::ploaddup2<PacketXl>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
return detail::ploadquad4<PacketXf>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const int32_t* from) {
return detail::ploadquad4<PacketXi>(from);
}
// No ploadquad for 2-element packets (PacketXd, PacketXl) at 16 bytes.
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
Packet16f x{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f, a + 4.0f, a + 5.0f, a + 6.0f, a + 7.0f,
a + 8.0f, a + 9.0f, a + 10.0f, a + 11.0f, a + 12.0f, a + 13.0f, a + 14.0f, a + 15.0f};
return x;
EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
return detail::ploaddup8<PacketXf>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
return Packet8d{a + 0.0, a + 1.0, a + 2.0, a + 3.0, a + 4.0, a + 5.0, a + 6.0, a + 7.0};
EIGEN_STRONG_INLINE PacketXd ploaddup<PacketXd>(const double* from) {
return detail::ploaddup4<PacketXd>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16i plset<Packet16i>(const int32_t& a) {
return Packet16i{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7,
a + 8, a + 9, a + 10, a + 11, a + 12, a + 13, a + 14, a + 15};
EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const int32_t* from) {
return detail::ploaddup8<PacketXi>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8l plset<Packet8l>(const int64_t& a) {
return Packet8l{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7};
EIGEN_STRONG_INLINE PacketXl ploaddup<PacketXl>(const int64_t* from) {
return detail::ploaddup4<PacketXl>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
return detail::ploadquad8<PacketXf>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXd ploadquad<PacketXd>(const double* from) {
return detail::ploadquad4<PacketXd>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const int32_t* from) {
return detail::ploadquad8<PacketXi>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXl ploadquad<PacketXl>(const int64_t* from) {
return detail::ploadquad4<PacketXl>(from);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /* unused */) {
EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
return detail::ploaddup16<PacketXf>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXd ploaddup<PacketXd>(const double* from) {
return detail::ploaddup8<PacketXd>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const int32_t* from) {
return detail::ploaddup16<PacketXi>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXl ploaddup<PacketXl>(const int64_t* from) {
return detail::ploaddup8<PacketXl>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
return detail::ploadquad16<PacketXf>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXd ploadquad<PacketXd>(const double* from) {
return detail::ploadquad8<PacketXd>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const int32_t* from) {
return detail::ploadquad16<PacketXi>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXl ploadquad<PacketXl>(const int64_t* from) {
return detail::ploadquad8<PacketXl>(from);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- plset ---
// plset(a) builds the arithmetic sequence {a, a+1, a+2, ...} across all lanes;
// one branch per supported generic vector width.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// 16 bytes: 4 floats / 2 doubles / 4 int32 / 2 int64.
template <>
EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
return PacketXf{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f};
}
template <>
EIGEN_STRONG_INLINE PacketXd plset<PacketXd>(const double& a) {
return PacketXd{a + 0.0, a + 1.0};
}
template <>
EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const int32_t& a) {
return PacketXi{a + 0, a + 1, a + 2, a + 3};
}
template <>
EIGEN_STRONG_INLINE PacketXl plset<PacketXl>(const int64_t& a) {
return PacketXl{a + 0, a + 1};
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// 32 bytes: 8 floats / 4 doubles / 8 int32 / 4 int64.
template <>
EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
return PacketXf{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f, a + 4.0f, a + 5.0f, a + 6.0f, a + 7.0f};
}
template <>
EIGEN_STRONG_INLINE PacketXd plset<PacketXd>(const double& a) {
return PacketXd{a + 0.0, a + 1.0, a + 2.0, a + 3.0};
}
template <>
EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const int32_t& a) {
return PacketXi{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7};
}
template <>
EIGEN_STRONG_INLINE PacketXl plset<PacketXl>(const int64_t& a) {
return PacketXl{a + 0, a + 1, a + 2, a + 3};
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// 64 bytes: 16 floats / 8 doubles / 16 int32 / 8 int64.
template <>
EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
return PacketXf{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f, a + 4.0f, a + 5.0f, a + 6.0f, a + 7.0f,
a + 8.0f, a + 9.0f, a + 10.0f, a + 11.0f, a + 12.0f, a + 13.0f, a + 14.0f, a + 15.0f};
}
template <>
EIGEN_STRONG_INLINE PacketXd plset<PacketXd>(const double& a) {
return PacketXd{a + 0.0, a + 1.0, a + 2.0, a + 3.0, a + 4.0, a + 5.0, a + 6.0, a + 7.0};
}
template <>
EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const int32_t& a) {
return PacketXi{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7,
a + 8, a + 9, a + 10, a + 11, a + 12, a + 13, a + 14, a + 15};
}
template <>
EIGEN_STRONG_INLINE PacketXl plset<PacketXl>(const int64_t& a) {
return PacketXl{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7};
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- peven_mask ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE PacketXf peven_mask(const PacketXf& /* unused */) {
float kTrue = numext::bit_cast<float>(int32_t(-1));
float kFalse = 0.0f;
return Packet16f{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse,
kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse};
return PacketXf{kTrue, kFalse, kTrue, kFalse};
}
template <>
EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /* unused */) {
EIGEN_STRONG_INLINE PacketXd peven_mask(const PacketXd& /* unused */) {
double kTrue = numext::bit_cast<double>(int64_t(-1l));
double kFalse = 0.0;
return Packet8d{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse};
return PacketXd{kTrue, kFalse};
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// Alternating all-ones/all-zeros bit mask selecting even-indexed lanes
// (8-float / 4-double packets). The all-ones pattern is produced by
// bit-casting -1 of the matching integer width.
template <>
EIGEN_STRONG_INLINE PacketXf peven_mask(const PacketXf& /* unused */) {
float kTrue = numext::bit_cast<float>(int32_t(-1));
float kFalse = 0.0f;
return PacketXf{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse};
}
template <>
EIGEN_STRONG_INLINE PacketXd peven_mask(const PacketXd& /* unused */) {
double kTrue = numext::bit_cast<double>(int64_t(-1l));
double kFalse = 0.0;
return PacketXd{kTrue, kFalse, kTrue, kFalse};
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// Same even-lane mask for 16-float / 8-double packets.
template <>
EIGEN_STRONG_INLINE PacketXf peven_mask(const PacketXf& /* unused */) {
float kTrue = numext::bit_cast<float>(int32_t(-1));
float kFalse = 0.0f;
return PacketXf{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse,
kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse};
}
template <>
EIGEN_STRONG_INLINE PacketXd peven_mask(const PacketXd& /* unused */) {
double kTrue = numext::bit_cast<double>(int64_t(-1l));
double kFalse = 0.0;
return PacketXd{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse};
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// Helpers for ptranspose.
namespace detail {
template <typename Packet>
EIGEN_ALWAYS_INLINE void zip_in_place16(Packet& p1, Packet& p2) {
Packet tmp = __builtin_shufflevector(p1, p2, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
p2 = __builtin_shufflevector(p1, p2, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
EIGEN_ALWAYS_INLINE void zip_in_place2(Packet& p1, Packet& p2) {
Packet tmp = __builtin_shufflevector(p1, p2, 0, 2);
p2 = __builtin_shufflevector(p1, p2, 1, 3);
p1 = tmp;
}
// Interleaves two 4-lane packets in place:
// p1 <- {p1[0], p2[0], p1[1], p2[1]}, p2 <- {p1[2], p2[2], p1[3], p2[3]}.
template <typename Packet>
EIGEN_ALWAYS_INLINE void zip_in_place4(Packet& p1, Packet& p2) {
Packet tmp = __builtin_shufflevector(p1, p2, 0, 4, 1, 5);
p2 = __builtin_shufflevector(p1, p2, 2, 6, 3, 7);
p1 = tmp;
}
@@ -739,28 +986,68 @@ EIGEN_ALWAYS_INLINE void zip_in_place8(Packet& p1, Packet& p2) {
p1 = tmp;
}
// Interleaves two 16-lane packets in place: p1 receives lanes 0..7 of each
// input interleaved, p2 receives lanes 8..15 interleaved.
template <typename Packet>
EIGEN_ALWAYS_INLINE void zip_in_place16(Packet& p1, Packet& p2) {
Packet tmp = __builtin_shufflevector(p1, p2, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
p2 = __builtin_shufflevector(p1, p2, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
p1 = tmp;
}
template <typename Packet>
void zip_in_place(Packet& p1, Packet& p2);
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet16f>(Packet16f& p1, Packet16f& p2) {
zip_in_place16(p1, p2);
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXf>(PacketXf& p1, PacketXf& p2) {
zip_in_place4(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8d>(Packet8d& p1, Packet8d& p2) {
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXd>(PacketXd& p1, PacketXd& p2) {
zip_in_place2(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXi>(PacketXi& p1, PacketXi& p2) {
zip_in_place4(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXl>(PacketXl& p1, PacketXl& p2) {
zip_in_place2(p1, p2);
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXf>(PacketXf& p1, PacketXf& p2) {
zip_in_place8(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet16i>(Packet16i& p1, Packet16i& p2) {
zip_in_place16(p1, p2);
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXd>(PacketXd& p1, PacketXd& p2) {
zip_in_place4(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8l>(Packet8l& p1, Packet8l& p2) {
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXi>(PacketXi& p1, PacketXi& p2) {
zip_in_place8(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXl>(PacketXl& p1, PacketXl& p2) {
zip_in_place4(p1, p2);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// 64-byte vectors: dispatch to the lane-count-specific zip helper
// (16 lanes for float/int32 packets, 8 lanes for double/int64 packets).
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXf>(PacketXf& p1, PacketXf& p2) {
zip_in_place16(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXd>(PacketXd& p1, PacketXd& p2) {
zip_in_place8(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXi>(PacketXi& p1, PacketXi& p2) {
zip_in_place16(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXl>(PacketXl& p1, PacketXl& p2) {
zip_in_place8(p1, p2);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
template <typename Packet>
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
@@ -812,62 +1099,69 @@ EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
} // namespace detail
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
// ptranspose overloads: only emit valid block sizes per vector size.
// At 16 bytes: float has 4 elems, double has 2 elems.
// At 32 bytes: float has 8 elems, double has 4 elems.
// At 64 bytes: float has 16 elems, double has 8 elems.
// All sizes support PacketBlock<PacketXf, 2> and PacketBlock<PacketXf, 4>.
// In-place transpose of a 4xN float-packet block; valid at every vector size.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXf, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
// In-place transpose of a 2xN float-packet block; valid at every vector size.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXf, 2>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 8>& kernel) {
// All sizes support PacketBlock<PacketXd, 2>.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXd, 2>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
// All sizes support PacketBlock<PacketXi, 2> and PacketBlock<PacketXi, 4>.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXi, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXi, 2>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 2>& kernel) {
// All sizes support PacketBlock<PacketXl, 2>.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXl, 2>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
// 32+ bytes: float has 8+ elems, double has 4+ elems.
// These block sizes only become valid once the packet holds enough lanes.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXf, 8>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXd, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXi, 8>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXl, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
// 64 bytes: float has 16 elems, double has 8 elems.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXf, 16>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8d, 2>& kernel) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXd, 8>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16i, 16>& kernel) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXi, 16>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16i, 8>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16i, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16i, 2>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8l, 8>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8l, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8l, 2>& kernel) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXl, 8>& kernel) {
detail::ptranspose_impl(kernel);
}
#endif
#endif
} // end namespace internal
} // end namespace Eigen

View File

@@ -33,10 +33,10 @@ namespace internal {
return __builtin_reduce_or(a != 0) != 0; \
}
EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet16f)
EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet8d)
EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet16i)
EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet8l)
EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXf)
EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXd)
EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXi)
EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXl)
#undef EIGEN_CLANG_PACKET_REDUX_MINMAX
#endif
@@ -52,13 +52,38 @@ EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet8l)
}
// __builtin_reduce_{mul,add} are only defined for integer types.
EIGEN_CLANG_PACKET_REDUX_INT(Packet16i)
EIGEN_CLANG_PACKET_REDUX_INT(Packet8l)
EIGEN_CLANG_PACKET_REDUX_INT(PacketXi)
EIGEN_CLANG_PACKET_REDUX_INT(PacketXl)
#undef EIGEN_CLANG_PACKET_REDUX_INT
#endif
#if EIGEN_HAS_BUILTIN(__builtin_shufflevector)
namespace detail {
// Reduction helpers for different vector sizes.
// Each returns a pair of (even-sum, odd-sum) or (even-product, odd-product).
// ReduceAdd2: 2 lanes, the split is trivially (a[0], a[1]).
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd2(
const VectorT& a) {
return {a[0], a[1]};
}
// ReduceAdd4: one halving step; t1 = {a0+a2, a1+a3}.
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd4(
const VectorT& a) {
const auto t1 = __builtin_shufflevector(a, a, 0, 1) + __builtin_shufflevector(a, a, 2, 3);
return {t1[0], t1[1]};
}
// ReduceAdd8: two halving steps down to a (even, odd) pair.
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd8(
const VectorT& a) {
const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) + __builtin_shufflevector(a, a, 4, 5, 6, 7);
const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) + __builtin_shufflevector(t1, t1, 2, 3);
return {t2[0], t2[1]};
}
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd16(
const VectorT& a) {
@@ -70,10 +95,23 @@ EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_v
}
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd8(
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul2(
const VectorT& a) {
const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) + __builtin_shufflevector(a, a, 4, 5, 6, 7);
const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) + __builtin_shufflevector(t1, t1, 2, 3);
return {a[0], a[1]};
}
// ReduceMul4: multiplicative analogue of ReduceAdd4; returns
// (product of even-indexed lanes, product of odd-indexed lanes).
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul4(
const VectorT& a) {
const auto t1 = __builtin_shufflevector(a, a, 0, 1) * __builtin_shufflevector(a, a, 2, 3);
return {t1[0], t1[1]};
}
// ReduceMul8: two multiplicative halving steps down to the (even, odd) pair.
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul8(
const VectorT& a) {
const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) * __builtin_shufflevector(a, a, 4, 5, 6, 7);
const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) * __builtin_shufflevector(t1, t1, 2, 3);
return {t2[0], t2[1]};
}
@@ -86,57 +124,188 @@ EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_v
const auto t3 = __builtin_shufflevector(t2, t2, 0, 1) * __builtin_shufflevector(t2, t2, 2, 3);
return {t3[0], t3[1]};
}
// NOTE(review): this definition appears to duplicate an identical ReduceMul8
// earlier in this file (likely diff-rendering residue) — confirm that only one
// copy exists in the actual source, otherwise it is a redefinition error.
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul8(
    const VectorT& a) {
  // Fold 8 lanes -> 4 -> 2 via lane shuffles and lane-wise multiplies.
  const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) * __builtin_shufflevector(a, a, 4, 5, 6, 7);
  const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) * __builtin_shufflevector(t1, t1, 2, 3);
  return {t2[0], t2[1]};
}
} // namespace detail
// --- predux and predux_mul for float ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// Sum-reduction of PacketXf (4 floats in the 16-byte branch).
// (Removed stale pre-rename `predux<Packet16f>` signature line left by the diff.)
template <>
EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
  float even, odd;
  std::tie(even, odd) = detail::ReduceAdd4(a);
  return even + odd;
}
// Product-reduction of PacketXf (4 floats in the 16-byte branch).
template <>
EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
  const std::pair<float, float> partial = detail::ReduceMul4(a);
  return partial.first * partial.second;
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// Sum-reduction of PacketXf (8 floats in the 32-byte branch).
template <>
EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
  const std::pair<float, float> sums = detail::ReduceAdd8(a);
  return sums.first + sums.second;
}
// Product-reduction of PacketXf (8 floats in the 32-byte branch).
template <>
EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
  const std::pair<float, float> prods = detail::ReduceMul8(a);
  return prods.first * prods.second;
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// Sum-reduction of PacketXf (16 floats in the 64-byte branch).
template <>
EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
  const std::pair<float, float> sums = detail::ReduceAdd16(a);
  return sums.first + sums.second;
}
// Product-reduction of PacketXf (16 floats in the 64-byte branch).
// (Removed stale pre-rename `predux<Packet8d>` signature line left by the diff.)
template <>
EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
  float even, odd;
  std::tie(even, odd) = detail::ReduceMul16(a);
  return even * odd;
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- predux and predux_mul for double ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// Sum-reduction of PacketXd (2 doubles in the 16-byte branch).
template <>
EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
  const std::pair<double, double> sums = detail::ReduceAdd2(a);
  return sums.first + sums.second;
}
// Product-reduction of PacketXd (2 doubles in the 16-byte branch).
template <>
EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
  const std::pair<double, double> prods = detail::ReduceMul2(a);
  return prods.first * prods.second;
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// Sum-reduction of PacketXd (4 doubles in the 32-byte branch).
template <>
EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
  const std::pair<double, double> sums = detail::ReduceAdd4(a);
  return sums.first + sums.second;
}
// Product-reduction of PacketXd (4 doubles in the 32-byte branch).
template <>
EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
  const std::pair<double, double> prods = detail::ReduceMul4(a);
  return prods.first * prods.second;
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// Sum-reduction of PacketXd (8 doubles in the 64-byte branch).
template <>
EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
  const std::pair<double, double> sums = detail::ReduceAdd8(a);
  return sums.first + sums.second;
}
// Product-reduction of PacketXd (8 doubles in the 64-byte branch).
// (Removed diff residue: a stray duplicate `predux_mul<Packet16f>` specialization
// — which would redefine the float specialization above — and the stale
// pre-rename `Packet8d` signature line.)
template <>
EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
  double even, odd;
  std::tie(even, odd) = detail::ReduceMul8(a);
  return even * odd;
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- predux for complex<float> ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// Sum-reduction of PacketXcf (2 complex floats in the 16-byte branch):
// even/odd lane sums of the underlying real vector are the real/imag parts.
// (Removed stale pre-rename `Packet8cf` signature line left by the diff.)
template <>
EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
  float re, im;
  std::tie(re, im) = detail::ReduceAdd4(a.v);
  return std::complex<float>(re, im);
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// Sum-reduction of PacketXcf (4 complex floats in the 32-byte branch).
template <>
EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
  const std::pair<float, float> sums = detail::ReduceAdd8(a.v);
  return std::complex<float>(sums.first, sums.second);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// Sum-reduction of PacketXcf (8 complex floats in the 64-byte branch).
template <>
EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
  // Interleaved (re, im) lane layout: the even/odd partial sums of the real
  // vector are the summed real and imaginary parts.
  float re, im;
  std::tie(re, im) = detail::ReduceAdd16(a.v);
  return std::complex<float>(re, im);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- predux for complex<double> ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// Sum-reduction of PacketXcd in the 16-byte branch.
// (Removed stale pre-rename `Packet4cd` signature line left by the diff.)
template <>
EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
  // 1 complex double: just return it
  return a[0];
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
double re, im;
std::tie(re, im) = detail::ReduceAdd4(a.v);
return std::complex<double>(re, im);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// Sum-reduction of PacketXcd (4 complex doubles in the 64-byte branch).
template <>
EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
  // Even/odd lane sums of the real vector give the real/imag parts.
  double re, im;
  std::tie(re, im) = detail::ReduceAdd8(a.v);
  return std::complex<double>(re, im);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- predux_mul for complex<float> ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// Product-reduction of PacketXcf in the 16-byte branch.
// (Removed stale pre-rename `Packet8cf` signature line left by the diff.)
template <>
EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
  // 2 complex floats: just multiply them
  return a[0] * a[1];
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// Product-reduction of PacketXcf (4 complex floats in the 32-byte branch).
template <>
EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
  // 4 complex floats: split into 2+2, multiply, then scalar multiply
  const Packet2cf lower2 = Packet2cf(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3));
  const Packet2cf upper2 = Packet2cf(__builtin_shufflevector(a.v, a.v, 4, 5, 6, 7));
  // pmul on a complex packet performs lane-wise complex multiplication.
  const Packet2cf prod2 = pmul<Packet2cf>(lower2, upper2);
  return prod2[0] * prod2[1];
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// Product-reduction of PacketXcf (8 complex floats in the 64-byte branch).
// (The middle of this body was clobbered by a diff hunk header; the 4->2 fold
// is reconstructed here following the pattern of the 32-byte branch.)
template <>
EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
  // 8 complex floats: 8->4->2->scalar
  const Packet4cf lower4 = Packet4cf(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3, 4, 5, 6, 7));
  const Packet4cf upper4 = Packet4cf(__builtin_shufflevector(a.v, a.v, 8, 9, 10, 11, 12, 13, 14, 15));
  const Packet4cf prod4 = pmul<Packet4cf>(lower4, upper4);
  const Packet2cf lower2 = Packet2cf(__builtin_shufflevector(prod4.v, prod4.v, 0, 1, 2, 3));
  const Packet2cf upper2 = Packet2cf(__builtin_shufflevector(prod4.v, prod4.v, 4, 5, 6, 7));
  const Packet2cf prod2 = pmul<Packet2cf>(lower2, upper2);
  return prod2[0] * prod2[1];
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- predux_mul for complex<double> ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// Product-reduction of PacketXcd in the 16-byte branch.
// (Removed stale pre-rename `Packet4cd` signature line left by the diff.)
template <>
EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
  // 1 complex double: just return it
  return a[0];
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// Product-reduction of PacketXcd (2 complex doubles in the 32-byte branch):
// a single scalar complex multiply suffices.
template <>
EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
  return a[0] * a[1];
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// Product-reduction of PacketXcd (4 complex doubles in the 64-byte branch).
template <>
EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
  // 4 complex doubles: split into 2+2, multiply, then scalar multiply
  const Packet2cd lower2 = Packet2cd(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3));
  const Packet2cd upper2 = Packet2cd(__builtin_shufflevector(a.v, a.v, 4, 5, 6, 7));
  // pmul on a complex packet performs lane-wise complex multiplication.
  const Packet2cd prod2 = pmul<Packet2cd>(lower2, upper2);
  return prod2[0] * prod2[1];
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
#endif
} // end namespace internal

View File

@@ -20,56 +20,140 @@ namespace internal {
// preinterpret
//==============================================================================
// Bit-preserving reinterpretation of an integer packet as a float packet.
// (Removed doubled pre-rename Packet16f signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXf preinterpret<PacketXf, PacketXi>(const PacketXi& a) {
  return reinterpret_cast<PacketXf>(a);
}
// Bit-preserving reinterpretation of a float packet as an integer packet.
// (Removed doubled pre-rename Packet16i signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXi preinterpret<PacketXi, PacketXf>(const PacketXf& a) {
  return reinterpret_cast<PacketXi>(a);
}
// Bit-preserving reinterpretation of an int64 packet as a double packet.
// (Removed doubled pre-rename Packet8d signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXd preinterpret<PacketXd, PacketXl>(const PacketXl& a) {
  return reinterpret_cast<PacketXd>(a);
}
// Bit-preserving reinterpretation of a double packet as an int64 packet.
// (Removed doubled pre-rename Packet8l signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXl preinterpret<PacketXl, PacketXd>(const PacketXd& a) {
  return reinterpret_cast<PacketXl>(a);
}
//==============================================================================
// pcast
//==============================================================================
#if EIGEN_HAS_BUILTIN(__builtin_convertvector)
// Float-to-int conversions: __builtin_convertvector has UB for NaN/inf/
// out-of-range inputs. Replace NaN with 0 before converting so that
// pldexp_fast (which may pass NaN exponents) doesn't trigger UB.
// float -> int32 cast, lane-wise.
// (Removed doubled pre-rename Packet16i signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXi pcast<PacketXf, PacketXi>(const PacketXf& a) {
  // Replace NaN lanes with 0: __builtin_convertvector has UB for NaN inputs.
  const PacketXf safe = a == a ? a : PacketXf(0);
  return __builtin_convertvector(safe, PacketXi);
}
// int32 -> float cast, lane-wise (always well-defined, no NaN guard needed).
// (Removed doubled pre-rename Packet16f signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXf pcast<PacketXi, PacketXf>(const PacketXi& a) {
  return __builtin_convertvector(a, PacketXf);
}
// double -> int64 cast, lane-wise.
// (Removed doubled pre-rename Packet8l signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXl pcast<PacketXd, PacketXl>(const PacketXd& a) {
  // Replace NaN lanes with 0: __builtin_convertvector has UB for NaN inputs.
  const PacketXd safe = a == a ? a : PacketXd(0);
  return __builtin_convertvector(safe, PacketXl);
}
// int64 -> double cast, lane-wise (always well-defined, no NaN guard needed).
// (Removed doubled pre-rename Packet8d signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXd pcast<PacketXl, PacketXd>(const PacketXl& a) {
  return __builtin_convertvector(a, PacketXd);
}
// float -> double: converts lower half of floats to doubles
// double -> float: converts two PacketXd to one PacketXf
// int32 -> int64: converts lower half of int32s to int64s
// int64 -> int32: converts two PacketXl to one PacketXi
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// float -> double: widens the low 2 float lanes to 2 doubles.
template <>
EIGEN_STRONG_INLINE PacketXd pcast<PacketXf, PacketXd>(const PacketXf& a) {
  using HalfF = detail::VectorType<float, 2>;
  const HalfF low = __builtin_shufflevector(a, a, 0, 1);
  return __builtin_convertvector(low, PacketXd);
}
// double -> float: narrows two 2-double packets into one 4-float packet.
template <>
EIGEN_STRONG_INLINE PacketXf pcast<PacketXd, PacketXf>(const PacketXd& a, const PacketXd& b) {
  using HalfF = detail::VectorType<float, 2>;
  const HalfF low = __builtin_convertvector(a, HalfF);
  const HalfF high = __builtin_convertvector(b, HalfF);
  return __builtin_shufflevector(low, high, 0, 1, 2, 3);
}
// int32 -> int64: widens the low 2 int32 lanes to 2 int64s.
template <>
EIGEN_STRONG_INLINE PacketXl pcast<PacketXi, PacketXl>(const PacketXi& a) {
  using HalfI = detail::VectorType<int32_t, 2>;
  const HalfI low = __builtin_shufflevector(a, a, 0, 1);
  return __builtin_convertvector(low, PacketXl);
}
// int64 -> int32: narrows two 2-int64 packets into one 4-int32 packet.
template <>
EIGEN_STRONG_INLINE PacketXi pcast<PacketXl, PacketXi>(const PacketXl& a, const PacketXl& b) {
  using HalfI = detail::VectorType<int32_t, 2>;
  const HalfI low = __builtin_convertvector(a, HalfI);
  const HalfI high = __builtin_convertvector(b, HalfI);
  return __builtin_shufflevector(low, high, 0, 1, 2, 3);
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// float -> double: widens the low 4 float lanes to 4 doubles.
template <>
EIGEN_STRONG_INLINE PacketXd pcast<PacketXf, PacketXd>(const PacketXf& a) {
  using HalfF = detail::VectorType<float, 4>;
  const HalfF low = __builtin_shufflevector(a, a, 0, 1, 2, 3);
  return __builtin_convertvector(low, PacketXd);
}
// double -> float: narrows two 4-double packets into one 8-float packet.
template <>
EIGEN_STRONG_INLINE PacketXf pcast<PacketXd, PacketXf>(const PacketXd& a, const PacketXd& b) {
  using HalfF = detail::VectorType<float, 4>;
  const HalfF low = __builtin_convertvector(a, HalfF);
  const HalfF high = __builtin_convertvector(b, HalfF);
  return __builtin_shufflevector(low, high, 0, 1, 2, 3, 4, 5, 6, 7);
}
// int32 -> int64: widens the low 4 int32 lanes to 4 int64s.
template <>
EIGEN_STRONG_INLINE PacketXl pcast<PacketXi, PacketXl>(const PacketXi& a) {
  using HalfI = detail::VectorType<int32_t, 4>;
  const HalfI low = __builtin_shufflevector(a, a, 0, 1, 2, 3);
  return __builtin_convertvector(low, PacketXl);
}
// int64 -> int32: narrows two 4-int64 packets into one 8-int32 packet.
template <>
EIGEN_STRONG_INLINE PacketXi pcast<PacketXl, PacketXi>(const PacketXl& a, const PacketXl& b) {
  using HalfI = detail::VectorType<int32_t, 4>;
  const HalfI low = __builtin_convertvector(a, HalfI);
  const HalfI high = __builtin_convertvector(b, HalfI);
  return __builtin_shufflevector(low, high, 0, 1, 2, 3, 4, 5, 6, 7);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// float -> double: converts lower 8 floats to 8 doubles
// (Removed stale pre-rename Packet8d/Packet16f signature and return lines
// left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXd pcast<PacketXf, PacketXd>(const PacketXf& a) {
  using HalfFloat = detail::VectorType<float, 8>;
  HalfFloat lo = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7);
  return __builtin_convertvector(lo, PacketXd);
}
// double -> float: converts two PacketXd (8 doubles each) to one PacketXf (16 floats)
// (This body was truncated by a diff hunk header; the final 16-lane shuffle is
// reconstructed following the pattern of the 16/32-byte branches.)
template <>
EIGEN_STRONG_INLINE PacketXf pcast<PacketXd, PacketXf>(const PacketXd& a, const PacketXd& b) {
  using HalfFloat = detail::VectorType<float, 8>;
  HalfFloat lo = __builtin_convertvector(a, HalfFloat);
  HalfFloat hi = __builtin_convertvector(b, HalfFloat);
  return __builtin_shufflevector(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}
// int32 -> int64: converts lower 8 int32s to 8 int64s
// (Removed stale pre-rename Packet8l/Packet16i signature and return lines
// left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXl pcast<PacketXi, PacketXl>(const PacketXi& a) {
  using HalfInt = detail::VectorType<int32_t, 8>;
  HalfInt lo = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7);
  return __builtin_convertvector(lo, PacketXl);
}
// int64 -> int32: converts two PacketXl (8 int64s each) to one PacketXi (16 int32s)
// (Removed stale pre-rename Packet16i/Packet8l signature line left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXi pcast<PacketXl, PacketXi>(const PacketXl& a, const PacketXl& b) {
  using HalfInt = detail::VectorType<int32_t, 8>;
  HalfInt lo = __builtin_convertvector(a, HalfInt);
  HalfInt hi = __builtin_convertvector(b, HalfInt);
  return __builtin_shufflevector(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
#endif
} // end namespace internal

View File

@@ -186,6 +186,17 @@ ei_add_test(mixingtypes)
ei_add_test(float_conversion)
ei_add_test(io)
ei_add_test(packetmath "-DEIGEN_FAST_MATH=1")
# Generic clang vector backend tests for different vector sizes.
# Probe for clang-style ext_vector_type / __builtin_vectorelements support
# before registering the size-specific test targets, so non-clang compilers
# simply skip these tests instead of failing to build them.
include(CheckCXXSourceCompiles)
check_cxx_source_compiles("
typedef float v4sf __attribute__((ext_vector_type(4)));
int main() { return __builtin_vectorelements(v4sf{}); }
" COMPILER_SUPPORTS_VECTOR_EXTENSIONS)
if(COMPILER_SUPPORTS_VECTOR_EXTENSIONS)
# Each target recompiles packetmath.cpp with a different forced vector width.
ei_add_test(packetmath_generic_16 "-DEIGEN_FAST_MATH=1")
ei_add_test(packetmath_generic_32 "-DEIGEN_FAST_MATH=1")
ei_add_test(packetmath_generic_64 "-DEIGEN_FAST_MATH=1")
endif()
ei_add_test(packet_segment)
ei_add_test(vectorization_logic)
ei_add_test(basicstuff)

View File

@@ -0,0 +1,4 @@
// Force the generic clang vector backend with 16-byte vectors.
// Runs the full packetmath test suite under this configuration.
#define EIGEN_VECTORIZE_GENERIC 1
#define EIGEN_GENERIC_VECTOR_SIZE_BYTES 16
#include "packetmath.cpp"

View File

@@ -0,0 +1,4 @@
// Force the generic clang vector backend with 32-byte vectors.
// Runs the full packetmath test suite under this configuration.
#define EIGEN_VECTORIZE_GENERIC 1
#define EIGEN_GENERIC_VECTOR_SIZE_BYTES 32
#include "packetmath.cpp"

View File

@@ -0,0 +1,4 @@
// Force the generic clang vector backend with 64-byte vectors.
// Runs the full packetmath test suite under this configuration.
#define EIGEN_VECTORIZE_GENERIC 1
#define EIGEN_GENERIC_VECTOR_SIZE_BYTES 64
#include "packetmath.cpp"