Make clang generic vector backend support 16, 32, and 64-byte vectors

libeigen/eigen!2213

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
This commit is contained in:
Rasmus Munk Larsen
2026-02-25 08:50:47 -08:00
parent ea25ea52bb
commit 4fab38d798
9 changed files with 1249 additions and 346 deletions

View File

@@ -27,11 +27,21 @@ struct complex_packet_wrapper {
RealPacketT v;
};
using Packet8cf = complex_packet_wrapper<float, 8>;
using Packet4cf = complex_packet_wrapper<float, 4>;
// --- Primary complex packet aliases ---
constexpr int kComplexFloatSize = kFloatPacketSize / 2; // 2, 4, or 8
constexpr int kComplexDoubleSize = kDoublePacketSize / 2; // 1, 2, or 4
using PacketXcf = complex_packet_wrapper<float, kComplexFloatSize>;
using PacketXcd = complex_packet_wrapper<double, kComplexDoubleSize>;
// Sub-packet types needed for reductions at larger sizes.
// When PacketXcf/PacketXcd already has a given element count, the
// corresponding fixed-size alias is skipped to avoid duplicate definitions.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
using Packet2cf = complex_packet_wrapper<float, 2>;
using Packet4cd = complex_packet_wrapper<double, 4>;
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
using Packet4cf = complex_packet_wrapper<float, 4>;
using Packet2cd = complex_packet_wrapper<double, 2>;
#endif
struct generic_complex_packet_traits : default_packet_traits {
enum {
@@ -58,39 +68,39 @@ struct generic_complex_packet_traits : default_packet_traits {
template <>
struct packet_traits<std::complex<float>> : generic_complex_packet_traits {
using type = Packet8cf;
using half = Packet8cf;
using type = PacketXcf;
using half = PacketXcf;
enum {
size = 8,
size = kComplexFloatSize,
};
};
template <>
struct unpacket_traits<Packet8cf> : generic_unpacket_traits {
struct unpacket_traits<PacketXcf> : generic_unpacket_traits {
using type = std::complex<float>;
using half = Packet8cf;
using as_real = Packet16f;
using half = PacketXcf;
using as_real = PacketXf;
enum {
size = 8,
size = kComplexFloatSize,
};
};
template <>
struct packet_traits<std::complex<double>> : generic_complex_packet_traits {
using type = Packet4cd;
using half = Packet4cd;
using type = PacketXcd;
using half = PacketXcd;
enum {
size = 4,
size = kComplexDoubleSize,
};
};
template <>
struct unpacket_traits<Packet4cd> : generic_unpacket_traits {
struct unpacket_traits<PacketXcd> : generic_unpacket_traits {
using type = std::complex<double>;
using half = Packet4cd;
using as_real = Packet8d;
using half = PacketXcd;
using as_real = PacketXd;
enum {
size = 4,
size = kComplexDoubleSize,
};
};
@@ -115,24 +125,58 @@ struct unpacket_traits<Packet4cd> : generic_unpacket_traits {
pstore(&numext::real_ref(*to), from.v); \
}
EIGEN_CLANG_COMPLEX_LOAD_STORE(Packet8cf);
EIGEN_CLANG_COMPLEX_LOAD_STORE(Packet4cd);
EIGEN_CLANG_COMPLEX_LOAD_STORE(PacketXcf);
EIGEN_CLANG_COMPLEX_LOAD_STORE(PacketXcd);
#undef EIGEN_CLANG_COMPLEX_LOAD_STORE
template <>
EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from) {
const float re = numext::real(from);
const float im = numext::imag(from);
return Packet8cf(Packet16f{re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im});
}
// --- pset1 for complex ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from) {
EIGEN_STRONG_INLINE PacketXcf pset1<PacketXcf>(const std::complex<float>& from) {
const float re = numext::real(from);
const float im = numext::imag(from);
return PacketXcf(PacketXf{re, im, re, im});
}
template <>
EIGEN_STRONG_INLINE PacketXcd pset1<PacketXcd>(const std::complex<double>& from) {
const double re = numext::real(from);
const double im = numext::imag(from);
return Packet4cd(Packet8d{re, im, re, im, re, im, re, im});
return PacketXcd(PacketXd{re, im});
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf pset1<PacketXcf>(const std::complex<float>& from) {
const float re = numext::real(from);
const float im = numext::imag(from);
return PacketXcf(PacketXf{re, im, re, im, re, im, re, im});
}
template <>
EIGEN_STRONG_INLINE PacketXcd pset1<PacketXcd>(const std::complex<double>& from) {
const double re = numext::real(from);
const double im = numext::imag(from);
return PacketXcd(PacketXd{re, im, re, im});
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf pset1<PacketXcf>(const std::complex<float>& from) {
const float re = numext::real(from);
const float im = numext::imag(from);
return PacketXcf(PacketXf{re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im});
}
template <>
EIGEN_STRONG_INLINE PacketXcd pset1<PacketXcd>(const std::complex<double>& from) {
const double re = numext::real(from);
const double im = numext::imag(from);
return PacketXcd(PacketXd{re, im, re, im, re, im, re, im});
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// ----------- Unary ops ------------------
#define DELEGATE_UNARY_TO_REAL_OP(PACKET_TYPE, OP) \
template <> \
@@ -149,134 +193,348 @@ EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from)
} \
EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(PACKET_TYPE)
EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(Packet8cf);
EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(Packet4cd);
EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(PacketXcf);
EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(PacketXcd);
// --- pconj ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet8cf pconj<Packet8cf>(const Packet8cf& a) {
return Packet8cf(__builtin_shufflevector(a.v, -a.v, 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31));
EIGEN_STRONG_INLINE PacketXcf pconj<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pconj<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, -a.v, 0, 3));
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf pconj<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pconj<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf pconj<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, -a.v, 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pconj<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15));
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// Sub-packet pconj specializations needed for reductions.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
template <>
EIGEN_STRONG_INLINE Packet2cf pconj<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
}
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
template <>
EIGEN_STRONG_INLINE Packet4cf pconj<Packet4cf>(const Packet4cf& a) {
return Packet4cf(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pconj<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
}
template <>
EIGEN_STRONG_INLINE Packet4cd pconj<Packet4cd>(const Packet4cd& a) {
return Packet4cd(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pconj<Packet2cd>(const Packet2cd& a) {
return Packet2cd(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7));
}
#endif
#undef DELEGATE_UNARY_TO_REAL_OP
#undef EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS
// Flip real and imaginary parts, i.e. {re(a), im(a)} -> {im(a), re(a)}.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& a) {
return Packet8cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
EIGEN_STRONG_INLINE PacketXcf pcplxflip<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pcplxflip<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 0));
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf pcplxflip<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pcplxflip<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf pcplxflip<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pcplxflip<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6));
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// Sub-packet pcplxflip specializations needed for reductions.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
template <>
EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
}
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
template <>
EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& a) {
return Packet4cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
}
template <>
EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& a) {
return Packet4cd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& a) {
return Packet2cd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2));
}
#endif
// Copy real to imaginary part, i.e. {re(a), im(a)} -> {re(a), re(a)}.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet8cf pdupreal<Packet8cf>(const Packet8cf& a) {
return Packet8cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14));
EIGEN_STRONG_INLINE PacketXcf pdupreal<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pdupreal<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 0, 0));
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf pdupreal<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pdupreal<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf pdupreal<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pdupreal<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6));
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// Sub-packet pdupreal specializations needed for reductions.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
template <>
EIGEN_STRONG_INLINE Packet2cf pdupreal<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
}
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
template <>
EIGEN_STRONG_INLINE Packet4cf pdupreal<Packet4cf>(const Packet4cf& a) {
return Packet4cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pdupreal<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
}
template <>
EIGEN_STRONG_INLINE Packet4cd pdupreal<Packet4cd>(const Packet4cd& a) {
return Packet4cd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pdupreal<Packet2cd>(const Packet2cd& a) {
return Packet2cd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2));
}
#endif
// Copy imaginary to real part, i.e. {re(a), im(a)} -> {im(a), im(a)}.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet8cf pdupimag<Packet8cf>(const Packet8cf& a) {
return Packet8cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15));
EIGEN_STRONG_INLINE PacketXcf pdupimag<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pdupimag<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 1));
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf pdupimag<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pdupimag<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf pdupimag<PacketXcf>(const PacketXcf& a) {
return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15));
}
template <>
EIGEN_STRONG_INLINE PacketXcd pdupimag<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7));
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// Sub-packet pdupimag specializations needed for reductions.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
template <>
EIGEN_STRONG_INLINE Packet2cf pdupimag<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
}
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
template <>
EIGEN_STRONG_INLINE Packet4cf pdupimag<Packet4cf>(const Packet4cf& a) {
return Packet4cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pdupimag<Packet2cf>(const Packet2cf& a) {
return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
}
template <>
EIGEN_STRONG_INLINE Packet4cd pdupimag<Packet4cd>(const Packet4cd& a) {
return Packet4cd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7));
}
template <>
EIGEN_STRONG_INLINE Packet2cd pdupimag<Packet2cd>(const Packet2cd& a) {
return Packet2cd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3));
}
#endif
// --- ploaddup ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from) {
return Packet8cf(Packet16f{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]),
std::real(from[2]), std::imag(from[2]), std::real(from[2]), std::imag(from[2]),
std::real(from[3]), std::imag(from[3]), std::real(from[3]), std::imag(from[3])});
EIGEN_STRONG_INLINE PacketXcf ploaddup<PacketXcf>(const std::complex<float>* from) {
return pset1<PacketXcf>(*from);
}
template <>
EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
return Packet4cd(Packet8d{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
EIGEN_STRONG_INLINE PacketXcd ploaddup<PacketXcd>(const std::complex<double>* from) {
return pset1<PacketXcd>(*from);
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf ploaddup<PacketXcf>(const std::complex<float>* from) {
return PacketXcf(PacketXf{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])});
}
template <>
EIGEN_STRONG_INLINE PacketXcd ploaddup<PacketXcd>(const std::complex<double>* from) {
return pset1<PacketXcd>(*from);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf ploaddup<PacketXcf>(const std::complex<float>* from) {
return PacketXcf(PacketXf{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]),
std::real(from[2]), std::imag(from[2]), std::real(from[2]), std::imag(from[2]),
std::real(from[3]), std::imag(from[3]), std::real(from[3]), std::imag(from[3])});
}
template <>
EIGEN_STRONG_INLINE PacketXcd ploaddup<PacketXcd>(const std::complex<double>* from) {
return PacketXcd(PacketXd{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])});
}
template <>
EIGEN_STRONG_INLINE Packet8cf ploadquad<Packet8cf>(const std::complex<float>* from) {
return Packet8cf(Packet16f{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])});
}
template <>
EIGEN_STRONG_INLINE Packet4cd ploadquad<Packet4cd>(const std::complex<double>* from) {
return pset1<Packet4cd>(*from);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- ploadquad ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet8cf preverse<Packet8cf>(const Packet8cf& a) {
return Packet8cf(reinterpret_cast<Packet16f>(preverse(reinterpret_cast<Packet8d>(a.v))));
EIGEN_STRONG_INLINE PacketXcf ploadquad<PacketXcf>(const std::complex<float>* from) {
return pset1<PacketXcf>(*from);
}
template <>
EIGEN_STRONG_INLINE Packet4cd preverse<Packet4cd>(const Packet4cd& a) {
return Packet4cd(__builtin_shufflevector(a.v, a.v, 6, 7, 4, 5, 2, 3, 0, 1));
EIGEN_STRONG_INLINE PacketXcd ploadquad<PacketXcd>(const std::complex<double>* from) {
return pset1<PacketXcd>(*from);
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf ploadquad<PacketXcf>(const std::complex<float>* from) {
return pset1<PacketXcf>(*from);
}
template <>
EIGEN_STRONG_INLINE PacketXcd ploadquad<PacketXcd>(const std::complex<double>* from) {
return pset1<PacketXcd>(*from);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf ploadquad<PacketXcf>(const std::complex<float>* from) {
return PacketXcf(PacketXf{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]),
std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])});
}
template <>
EIGEN_STRONG_INLINE PacketXcd ploadquad<PacketXcd>(const std::complex<double>* from) {
return pset1<PacketXcd>(*from);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- preverse ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE PacketXcf preverse<PacketXcf>(const PacketXcf& a) {
// 2 complex floats: swap the two (re,im) pairs, i.e. float lanes (0,1) <-> (2,3)
return PacketXcf(__builtin_shufflevector(a.v, a.v, 2, 3, 0, 1));
}
template <>
EIGEN_STRONG_INLINE PacketXcd preverse<PacketXcd>(const PacketXcd& a) {
// 1 complex double: identity
return a;
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE PacketXcf preverse<PacketXcf>(const PacketXcf& a) {
// 4 complex floats: reverse pairs
return PacketXcf(reinterpret_cast<PacketXf>(preverse(reinterpret_cast<PacketXd>(a.v))));
}
template <>
EIGEN_STRONG_INLINE PacketXcd preverse<PacketXcd>(const PacketXcd& a) {
// 2 complex doubles: swap pairs
return PacketXcd(__builtin_shufflevector(a.v, a.v, 2, 3, 0, 1));
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE PacketXcf preverse<PacketXcf>(const PacketXcf& a) {
return PacketXcf(reinterpret_cast<PacketXf>(preverse(reinterpret_cast<PacketXd>(a.v))));
}
template <>
EIGEN_STRONG_INLINE PacketXcd preverse<PacketXcd>(const PacketXcd& a) {
return PacketXcd(__builtin_shufflevector(a.v, a.v, 6, 7, 4, 5, 2, 3, 0, 1));
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// ----------- Binary ops ------------------
#define DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, OP) \
template <> \
@@ -300,8 +558,8 @@ EIGEN_STRONG_INLINE Packet4cd preverse<Packet4cd>(const Packet4cd& a) {
return PACKET_TYPE(pand(pdupreal(t).v, pdupimag(t).v)); \
}
EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(Packet8cf);
EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(Packet4cd);
EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(PacketXcf);
EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(PacketXcd);
// Binary ops that are needed on sub-packets for predux and predux_mul.
#define EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(PACKET_TYPE) \
@@ -311,11 +569,17 @@ EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(Packet4cd);
return pmul_complex(a, b); \
}
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet8cf);
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet4cf);
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(PacketXcf);
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cf);
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet4cd);
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet4cf);
#endif
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(PacketXcd);
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cd);
#endif
#define EIGEN_CLANG_PACKET_SCATTER_GATHER(PACKET_TYPE) \
template <> \
@@ -338,8 +602,8 @@ EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cd);
return result; \
}
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8cf);
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet4cd);
EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXcf);
EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXcd);
#undef EIGEN_CLANG_PACKET_SCATTER_GATHER
#undef DELEGATE_BINARY_TO_REAL_OP
@@ -348,46 +612,89 @@ EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet4cd);
// ------------ ternary ops -------------
template <>
EIGEN_STRONG_INLINE Packet8cf pselect<Packet8cf>(const Packet8cf& mask, const Packet8cf& a, const Packet8cf& b) {
return Packet8cf(reinterpret_cast<Packet16f>(
pselect(reinterpret_cast<Packet8d>(mask.v), reinterpret_cast<Packet8d>(a.v), reinterpret_cast<Packet8d>(b.v))));
EIGEN_STRONG_INLINE PacketXcf pselect<PacketXcf>(const PacketXcf& mask, const PacketXcf& a, const PacketXcf& b) {
return PacketXcf(reinterpret_cast<PacketXf>(
pselect(reinterpret_cast<PacketXd>(mask.v), reinterpret_cast<PacketXd>(a.v), reinterpret_cast<PacketXd>(b.v))));
}
// --- zip_in_place for complex ---
namespace detail {
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8cf>(Packet8cf& p1, Packet8cf& p2) {
Packet16f tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23);
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcf>(PacketXcf& p1, PacketXcf& p2) {
PacketXf tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 4, 5);
p2.v = __builtin_shufflevector(p1.v, p2.v, 2, 3, 6, 7);
p1.v = tmp;
}
// PacketXcd at 16 bytes has 1 element, no zip_in_place needed.
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcf>(PacketXcf& p1, PacketXcf& p2) {
PacketXf tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 8, 9, 2, 3, 10, 11);
p2.v = __builtin_shufflevector(p1.v, p2.v, 4, 5, 12, 13, 6, 7, 14, 15);
p1.v = tmp;
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcd>(PacketXcd& p1, PacketXcd& p2) {
PacketXd tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 4, 5);
p2.v = __builtin_shufflevector(p1.v, p2.v, 2, 3, 6, 7);
p1.v = tmp;
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcf>(PacketXcf& p1, PacketXcf& p2) {
PacketXf tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23);
p2.v = __builtin_shufflevector(p1.v, p2.v, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31);
p1.v = tmp;
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet4cd>(Packet4cd& p1, Packet4cd& p2) {
Packet8d tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 8, 9, 2, 3, 10, 11);
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXcd>(PacketXcd& p1, PacketXcd& p2) {
PacketXd tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 8, 9, 2, 3, 10, 11);
p2.v = __builtin_shufflevector(p1.v, p2.v, 4, 5, 12, 13, 6, 7, 14, 15);
p1.v = tmp;
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
} // namespace detail
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8cf, 8>& kernel) {
// --- ptranspose for complex ---
// PacketXcf: valid block sizes depend on kComplexFloatSize.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcf, 2>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8cf, 4>& kernel) {
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcf, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8cf, 2>& kernel) {
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcf, 8>& kernel) {
detail::ptranspose_impl(kernel);
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4cd, 4>& kernel) {
// PacketXcd: valid block sizes depend on kComplexDoubleSize.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcd, 2>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4cd, 2>& kernel) {
#endif
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXcd, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
#endif
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf, Packet16f)
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd, Packet8d)
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PacketXcf, PacketXf)
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PacketXcd, PacketXd)
} // end namespace internal
} // end namespace Eigen

View File

@@ -18,27 +18,27 @@ namespace Eigen {
namespace internal {
template <>
EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent) {
EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent) {
return pfrexp_generic(a, exponent);
}
template <>
EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(const Packet8d& a, Packet8d& exponent) {
EIGEN_STRONG_INLINE PacketXd pfrexp<PacketXd>(const PacketXd& a, PacketXd& exponent) {
return pfrexp_generic(a, exponent);
}
template <>
EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent) {
return pldexp_generic(a, exponent);
}
template <>
EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
EIGEN_STRONG_INLINE PacketXd pldexp<PacketXd>(const PacketXd& a, const PacketXd& exponent) {
return pldexp_generic(a, exponent);
}
EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet16f)
EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet8d)
EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketXf)
EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketXd)
} // end namespace internal

View File

@@ -24,14 +24,32 @@ template <typename ScalarT, int n>
using VectorType = ScalarT __attribute__((ext_vector_type(n), aligned(n * sizeof(ScalarT))));
} // namespace detail
// --- Primary packet type definitions (fixed at 64 bytes) ---
// --- Naming Convention ---
// This backend uses size-independent type aliases so the same code works
// for EIGEN_GENERIC_VECTOR_SIZE_BYTES in {16, 32, 64}:
//
// PacketXf - float vector (4, 8, or 16 elements)
// PacketXd - double vector (2, 4, or 8 elements)
// PacketXi - int32_t vector (4, 8, or 16 elements)
// PacketXl - int64_t vector (2, 4, or 8 elements)
// PacketXcf - complex<float> vector (2, 4, or 8 elements) [in Complex.h]
// PacketXcd - complex<double> vector (1, 2, or 4 elements) [in Complex.h]
//
// The "X" suffix indicates the element count is determined by the macro
// EIGEN_GENERIC_VECTOR_SIZE_BYTES at compile time. Operations that require
// compile-time constant indices (e.g. __builtin_shufflevector) use
// #if EIGEN_GENERIC_VECTOR_SIZE_BYTES == ... blocks.
// TODO(rmlarsen): Generalize to other vector sizes.
static_assert(EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64, "We currently assume the full vector size is 64 bytes");
using Packet16f = detail::VectorType<float, 16>;
using Packet8d = detail::VectorType<double, 8>;
using Packet16i = detail::VectorType<int32_t, 16>;
using Packet8l = detail::VectorType<int64_t, 8>;
static_assert(EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 || EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 ||
EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64,
"EIGEN_GENERIC_VECTOR_SIZE_BYTES must be 16, 32, or 64");
constexpr int kFloatPacketSize = EIGEN_GENERIC_VECTOR_SIZE_BYTES / sizeof(float);
constexpr int kDoublePacketSize = EIGEN_GENERIC_VECTOR_SIZE_BYTES / sizeof(double);
using PacketXf = detail::VectorType<float, kFloatPacketSize>;
using PacketXd = detail::VectorType<double, kDoublePacketSize>;
using PacketXi = detail::VectorType<int32_t, kFloatPacketSize>;
using PacketXl = detail::VectorType<int64_t, kDoublePacketSize>;
// --- packet_traits specializations ---
struct generic_float_packet_traits : default_packet_traits {
@@ -82,20 +100,20 @@ struct generic_float_packet_traits : default_packet_traits {
template <>
struct packet_traits<float> : generic_float_packet_traits {
using type = Packet16f;
using half = Packet16f;
using type = PacketXf;
using half = PacketXf;
enum {
size = 16,
size = kFloatPacketSize,
};
};
template <>
struct packet_traits<double> : generic_float_packet_traits {
using type = Packet8d;
using half = Packet8d;
using type = PacketXd;
using half = PacketXd;
// Generic double-precision acos/asin are not yet implemented in
// GenericPacketMathFunctions.h (only float versions exist).
enum { size = 8, HasACos = 0, HasASin = 0 };
enum { size = kDoublePacketSize, HasACos = 0, HasASin = 0 };
};
struct generic_integer_packet_traits : default_packet_traits {
@@ -131,19 +149,19 @@ struct generic_integer_packet_traits : default_packet_traits {
template <>
struct packet_traits<int32_t> : generic_integer_packet_traits {
using type = Packet16i;
using half = Packet16i;
using type = PacketXi;
using half = PacketXi;
enum {
size = 16,
size = kFloatPacketSize,
};
};
template <>
struct packet_traits<int64_t> : generic_integer_packet_traits {
using type = Packet8l;
using half = Packet8l;
using type = PacketXl;
using half = PacketXl;
enum {
size = 8,
size = kDoublePacketSize,
};
};
@@ -156,37 +174,37 @@ struct generic_unpacket_traits : default_unpacket_traits {
};
template <>
struct unpacket_traits<Packet16f> : generic_unpacket_traits {
struct unpacket_traits<PacketXf> : generic_unpacket_traits {
using type = float;
using half = Packet16f;
using integer_packet = Packet16i;
using half = PacketXf;
using integer_packet = PacketXi;
enum {
size = 16,
size = kFloatPacketSize,
};
};
template <>
struct unpacket_traits<Packet8d> : generic_unpacket_traits {
struct unpacket_traits<PacketXd> : generic_unpacket_traits {
using type = double;
using half = Packet8d;
using integer_packet = Packet8l;
using half = PacketXd;
using integer_packet = PacketXl;
enum {
size = 8,
size = kDoublePacketSize,
};
};
template <>
struct unpacket_traits<Packet16i> : generic_unpacket_traits {
struct unpacket_traits<PacketXi> : generic_unpacket_traits {
using type = int32_t;
using half = Packet16i;
using half = PacketXi;
enum {
size = 16,
size = kFloatPacketSize,
};
};
template <>
struct unpacket_traits<Packet8l> : generic_unpacket_traits {
struct unpacket_traits<PacketXl> : generic_unpacket_traits {
using type = int64_t;
using half = Packet8l;
using half = PacketXl;
enum {
size = 8,
size = kDoublePacketSize,
};
};
@@ -265,21 +283,21 @@ EIGEN_STRONG_INLINE void store_vector_aligned(scalar_type_of_vector_t<VectorT>*
detail::store_vector_aligned<PACKET_TYPE>(to, from); \
}
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16f)
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8d)
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16i)
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8l)
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXf)
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXd)
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXi)
EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXl)
#undef EIGEN_CLANG_PACKET_LOAD_STORE_PACKET
// --- Broadcast operation ---
template <>
EIGEN_STRONG_INLINE Packet16f pset1frombits<Packet16f>(uint32_t from) {
return Packet16f(numext::bit_cast<float>(from));
EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(uint32_t from) {
return PacketXf(numext::bit_cast<float>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8d pset1frombits<Packet8d>(uint64_t from) {
return Packet8d(numext::bit_cast<double>(from));
EIGEN_STRONG_INLINE PacketXd pset1frombits<PacketXd>(uint64_t from) {
return PacketXd(numext::bit_cast<double>(from));
}
#define EIGEN_CLANG_PACKET_SET1(PACKET_TYPE) \
@@ -292,10 +310,10 @@ EIGEN_STRONG_INLINE Packet8d pset1frombits<Packet8d>(uint64_t from) {
return from[0]; \
}
EIGEN_CLANG_PACKET_SET1(Packet16f)
EIGEN_CLANG_PACKET_SET1(Packet8d)
EIGEN_CLANG_PACKET_SET1(Packet16i)
EIGEN_CLANG_PACKET_SET1(Packet8l)
EIGEN_CLANG_PACKET_SET1(PacketXf)
EIGEN_CLANG_PACKET_SET1(PacketXd)
EIGEN_CLANG_PACKET_SET1(PacketXi)
EIGEN_CLANG_PACKET_SET1(PacketXl)
#undef EIGEN_CLANG_PACKET_SET1
// --- Arithmetic operations ---
@@ -309,10 +327,10 @@ EIGEN_CLANG_PACKET_SET1(Packet8l)
return -a; \
}
EIGEN_CLANG_PACKET_ARITHMETIC(Packet16f)
EIGEN_CLANG_PACKET_ARITHMETIC(Packet8d)
EIGEN_CLANG_PACKET_ARITHMETIC(Packet16i)
EIGEN_CLANG_PACKET_ARITHMETIC(Packet8l)
EIGEN_CLANG_PACKET_ARITHMETIC(PacketXf)
EIGEN_CLANG_PACKET_ARITHMETIC(PacketXd)
EIGEN_CLANG_PACKET_ARITHMETIC(PacketXi)
EIGEN_CLANG_PACKET_ARITHMETIC(PacketXl)
#undef EIGEN_CLANG_PACKET_ARITHMETIC
// --- Bitwise operations (via casting) ---
@@ -321,10 +339,10 @@ namespace detail {
// Reinterpret-cast helpers, equivalent to preinterpret<> but defined here
// because PacketMath.h is included before TypeCasting.h.
EIGEN_STRONG_INLINE Packet16i preinterpret_float_to_int(const Packet16f& a) { return reinterpret_cast<Packet16i>(a); }
EIGEN_STRONG_INLINE Packet16f preinterpret_int_to_float(const Packet16i& a) { return reinterpret_cast<Packet16f>(a); }
EIGEN_STRONG_INLINE Packet8l preinterpret_double_to_long(const Packet8d& a) { return reinterpret_cast<Packet8l>(a); }
EIGEN_STRONG_INLINE Packet8d preinterpret_long_to_double(const Packet8l& a) { return reinterpret_cast<Packet8d>(a); }
EIGEN_STRONG_INLINE PacketXi preinterpret_float_to_int(const PacketXf& a) { return reinterpret_cast<PacketXi>(a); }
EIGEN_STRONG_INLINE PacketXf preinterpret_int_to_float(const PacketXi& a) { return reinterpret_cast<PacketXf>(a); }
EIGEN_STRONG_INLINE PacketXl preinterpret_double_to_long(const PacketXd& a) { return reinterpret_cast<PacketXl>(a); }
EIGEN_STRONG_INLINE PacketXd preinterpret_long_to_double(const PacketXl& a) { return reinterpret_cast<PacketXd>(a); }
} // namespace detail
@@ -368,8 +386,8 @@ EIGEN_STRONG_INLINE Packet8d preinterpret_long_to_double(const Packet8l& a) { re
return a << N; \
}
EIGEN_CLANG_PACKET_BITWISE_INT(Packet16i)
EIGEN_CLANG_PACKET_BITWISE_INT(Packet8l)
EIGEN_CLANG_PACKET_BITWISE_INT(PacketXi)
EIGEN_CLANG_PACKET_BITWISE_INT(PacketXl)
#undef EIGEN_CLANG_PACKET_BITWISE_INT
// Bitwise ops for floating point packets
@@ -401,8 +419,8 @@ EIGEN_CLANG_PACKET_BITWISE_INT(Packet8l)
return CAST_FROM_INT(CAST_TO_INT(a) & ~CAST_TO_INT(b)); \
}
EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet16f, detail::preinterpret_float_to_int, detail::preinterpret_int_to_float)
EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet8d, detail::preinterpret_double_to_long, detail::preinterpret_long_to_double)
EIGEN_CLANG_PACKET_BITWISE_FLOAT(PacketXf, detail::preinterpret_float_to_int, detail::preinterpret_int_to_float)
EIGEN_CLANG_PACKET_BITWISE_FLOAT(PacketXd, detail::preinterpret_double_to_long, detail::preinterpret_long_to_double)
#undef EIGEN_CLANG_PACKET_BITWISE_FLOAT
// --- Comparison operations ---
@@ -428,8 +446,8 @@ EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet8d, detail::preinterpret_double_to_long,
return numext::bit_cast<PACKET_TYPE>(INT_PACKET_TYPE(!(a >= b))); \
}
EIGEN_CLANG_PACKET_CMP(Packet16f, Packet16i)
EIGEN_CLANG_PACKET_CMP(Packet8d, Packet8l)
EIGEN_CLANG_PACKET_CMP(PacketXf, PacketXi)
EIGEN_CLANG_PACKET_CMP(PacketXd, PacketXl)
#undef EIGEN_CLANG_PACKET_CMP
// --- Min/Max operations ---
@@ -472,10 +490,10 @@ EIGEN_CLANG_PACKET_CMP(Packet8d, Packet8l)
return mask != 0 ? a : b; \
}
EIGEN_CLANG_PACKET_ELEMENTWISE(Packet16f)
EIGEN_CLANG_PACKET_ELEMENTWISE(Packet8d)
EIGEN_CLANG_PACKET_ELEMENTWISE(Packet16i)
EIGEN_CLANG_PACKET_ELEMENTWISE(Packet8l)
EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXf)
EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXd)
EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXi)
EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXl)
#undef EIGEN_CLANG_PACKET_ELEMENTWISE
#endif
@@ -510,8 +528,8 @@ EIGEN_CLANG_PACKET_ELEMENTWISE(Packet8l)
return __builtin_elementwise_sqrt(a); \
}
EIGEN_CLANG_PACKET_MATH_FLOAT(Packet16f)
EIGEN_CLANG_PACKET_MATH_FLOAT(Packet8d)
EIGEN_CLANG_PACKET_MATH_FLOAT(PacketXf)
EIGEN_CLANG_PACKET_MATH_FLOAT(PacketXd)
#undef EIGEN_CLANG_PACKET_MATH_FLOAT
#endif
@@ -563,8 +581,8 @@ EIGEN_CLANG_PACKET_MATH_FLOAT(Packet8d)
}
#endif
EIGEN_CLANG_PACKET_MADD(Packet16f)
EIGEN_CLANG_PACKET_MADD(Packet8d)
EIGEN_CLANG_PACKET_MADD(PacketXf)
EIGEN_CLANG_PACKET_MADD(PacketXd)
#undef EIGEN_CLANG_PACKET_MADD
#define EIGEN_CLANG_PACKET_SCATTER_GATHER(PACKET_TYPE) \
@@ -586,10 +604,10 @@ EIGEN_CLANG_PACKET_MADD(Packet8d)
return result; \
}
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet16f)
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8d)
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet16i)
EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8l)
EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXf)
EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXd)
EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXi)
EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXl)
#undef EIGEN_CLANG_PACKET_SCATTER_GATHER
@@ -597,6 +615,14 @@ EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8l)
#if EIGEN_HAS_BUILTIN(__builtin_shufflevector)
namespace detail {
// Size-specific lane reversal via a compile-time shuffle.
// preverse_impl_N reverses an N-lane vector: [a0, ..., aN-1] -> [aN-1, ..., a0].
template <typename Packet>
EIGEN_STRONG_INLINE Packet preverse_impl_2(const Packet& a) {
return __builtin_shufflevector(a, a, 1, 0);
}
// 4-lane reversal.
template <typename Packet>
EIGEN_STRONG_INLINE Packet preverse_impl_4(const Packet& a) {
return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}
// 8-lane reversal.
template <typename Packet>
EIGEN_STRONG_INLINE Packet preverse_impl_8(const Packet& a) {
return __builtin_shufflevector(a, a, 7, 6, 5, 4, 3, 2, 1, 0);
}
@@ -606,33 +632,81 @@ EIGEN_STRONG_INLINE Packet preverse_impl_16(const Packet& a) {
}
} // namespace detail
#define EIGEN_CLANG_PACKET_REVERSE(PACKET_TYPE, SIZE) \
template <> \
EIGEN_STRONG_INLINE PACKET_TYPE preverse<PACKET_TYPE>(const PACKET_TYPE& a) { \
return detail::preverse_impl_##SIZE(a); \
}
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
EIGEN_CLANG_PACKET_REVERSE(Packet16f, 16)
EIGEN_CLANG_PACKET_REVERSE(Packet8d, 8)
EIGEN_CLANG_PACKET_REVERSE(Packet16i, 16)
EIGEN_CLANG_PACKET_REVERSE(Packet8l, 8)
#undef EIGEN_CLANG_PACKET_REVERSE
template <>
EIGEN_STRONG_INLINE PacketXf preverse<PacketXf>(const PacketXf& a) {
return detail::preverse_impl_4(a);
}
template <>
EIGEN_STRONG_INLINE PacketXd preverse<PacketXd>(const PacketXd& a) {
return detail::preverse_impl_2(a);
}
template <>
EIGEN_STRONG_INLINE PacketXi preverse<PacketXi>(const PacketXi& a) {
return detail::preverse_impl_4(a);
}
template <>
EIGEN_STRONG_INLINE PacketXl preverse<PacketXl>(const PacketXl& a) {
return detail::preverse_impl_2(a);
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// 32-byte vectors: PacketXf/PacketXi carry 8 lanes, PacketXd/PacketXl carry 4.
template <>
EIGEN_STRONG_INLINE PacketXf preverse<PacketXf>(const PacketXf& a) {
return detail::preverse_impl_8(a);
}
template <>
EIGEN_STRONG_INLINE PacketXd preverse<PacketXd>(const PacketXd& a) {
return detail::preverse_impl_4(a);
}
template <>
EIGEN_STRONG_INLINE PacketXi preverse<PacketXi>(const PacketXi& a) {
return detail::preverse_impl_8(a);
}
template <>
EIGEN_STRONG_INLINE PacketXl preverse<PacketXl>(const PacketXl& a) {
return detail::preverse_impl_4(a);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// 64-byte vectors: PacketXf/PacketXi carry 16 lanes, PacketXd/PacketXl carry 8.
template <>
EIGEN_STRONG_INLINE PacketXf preverse<PacketXf>(const PacketXf& a) {
return detail::preverse_impl_16(a);
}
template <>
EIGEN_STRONG_INLINE PacketXd preverse<PacketXd>(const PacketXd& a) {
return detail::preverse_impl_8(a);
}
template <>
EIGEN_STRONG_INLINE PacketXi preverse<PacketXi>(const PacketXi& a) {
return detail::preverse_impl_16(a);
}
template <>
EIGEN_STRONG_INLINE PacketXl preverse<PacketXl>(const PacketXl& a) {
return detail::preverse_impl_8(a);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
namespace detail {
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploaddup16(const typename unpacket_traits<Packet>::type* from) {
EIGEN_STRONG_INLINE Packet ploaddup2(const typename unpacket_traits<Packet>::type* from) {
static_assert((unpacket_traits<Packet>::size) % 2 == 0, "Packet size must be a multiple of 2");
using HalfPacket = HalfPacket<Packet>;
HalfPacket a = load_vector_unaligned<HalfPacket>(from);
return __builtin_shufflevector(a, a, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
return __builtin_shufflevector(a, a, 0, 0);
}
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploadquad16(const typename unpacket_traits<Packet>::type* from) {
static_assert((unpacket_traits<Packet>::size) % 4 == 0, "Packet size must be a multiple of 4");
using QuarterPacket = QuarterPacket<Packet>;
QuarterPacket a = load_vector_unaligned<QuarterPacket>(from);
return __builtin_shufflevector(a, a, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3);
EIGEN_STRONG_INLINE Packet ploaddup4(const typename unpacket_traits<Packet>::type* from) {
static_assert((unpacket_traits<Packet>::size) % 2 == 0, "Packet size must be a multiple of 2");
using HalfPacket = HalfPacket<Packet>;
HalfPacket a = load_vector_unaligned<HalfPacket>(from);
return __builtin_shufflevector(a, a, 0, 0, 1, 1);
}
template <typename Packet>
@@ -643,6 +717,22 @@ EIGEN_STRONG_INLINE Packet ploaddup8(const typename unpacket_traits<Packet>::typ
return __builtin_shufflevector(a, a, 0, 0, 1, 1, 2, 2, 3, 3);
}
// Loads 8 scalars and duplicates each into adjacent lanes of a 16-lane packet:
// from[0..7] -> {f0, f0, f1, f1, ..., f7, f7}.
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploaddup16(const typename unpacket_traits<Packet>::type* from) {
static_assert((unpacket_traits<Packet>::size) % 2 == 0, "Packet size must be a multiple of 2");
using HalfPacket = HalfPacket<Packet>;
HalfPacket a = load_vector_unaligned<HalfPacket>(from);
return __builtin_shufflevector(a, a, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7);
}
// Loads one scalar and broadcasts it to all 4 lanes of the packet:
// from[0] -> {f0, f0, f0, f0}.
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploadquad4(const typename unpacket_traits<Packet>::type* from) {
static_assert((unpacket_traits<Packet>::size) % 4 == 0, "Packet size must be a multiple of 4");
using QuarterPacket = QuarterPacket<Packet>;
QuarterPacket a = load_vector_unaligned<QuarterPacket>(from);
return __builtin_shufflevector(a, a, 0, 0, 0, 0);
}
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploadquad8(const typename unpacket_traits<Packet>::type* from) {
static_assert((unpacket_traits<Packet>::size) % 4 == 0, "Packet size must be a multiple of 4");
@@ -651,84 +741,241 @@ EIGEN_STRONG_INLINE Packet ploadquad8(const typename unpacket_traits<Packet>::ty
return __builtin_shufflevector(a, a, 0, 0, 0, 0, 1, 1, 1, 1);
}
// Loads 4 scalars and replicates each into 4 adjacent lanes of a 16-lane packet:
// from[0..3] -> {f0 x4, f1 x4, f2 x4, f3 x4}.
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploadquad16(const typename unpacket_traits<Packet>::type* from) {
static_assert((unpacket_traits<Packet>::size) % 4 == 0, "Packet size must be a multiple of 4");
using QuarterPacket = QuarterPacket<Packet>;
QuarterPacket a = load_vector_unaligned<QuarterPacket>(from);
return __builtin_shufflevector(a, a, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3);
}
} // namespace detail
template <>
EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
return detail::ploaddup16<Packet16f>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
return detail::ploaddup8<Packet8d>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16i ploaddup<Packet16i>(const int32_t* from) {
return detail::ploaddup16<Packet16i>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8l ploaddup<Packet8l>(const int64_t* from) {
return detail::ploaddup8<Packet8l>(from);
}
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
return detail::ploadquad16<Packet16f>(from);
EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
return detail::ploaddup4<PacketXf>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
return detail::ploadquad8<Packet8d>(from);
EIGEN_STRONG_INLINE PacketXd ploaddup<PacketXd>(const double* from) {
return detail::ploaddup2<PacketXd>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16i ploadquad<Packet16i>(const int32_t* from) {
return detail::ploadquad16<Packet16i>(from);
EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const int32_t* from) {
return detail::ploaddup4<PacketXi>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8l ploadquad<Packet8l>(const int64_t* from) {
return detail::ploadquad8<Packet8l>(from);
EIGEN_STRONG_INLINE PacketXl ploaddup<PacketXl>(const int64_t* from) {
return detail::ploaddup2<PacketXl>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
return detail::ploadquad4<PacketXf>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const int32_t* from) {
return detail::ploadquad4<PacketXi>(from);
}
// No ploadquad for 2-element packets (PacketXd, PacketXl) at 16 bytes.
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
Packet16f x{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f, a + 4.0f, a + 5.0f, a + 6.0f, a + 7.0f,
a + 8.0f, a + 9.0f, a + 10.0f, a + 11.0f, a + 12.0f, a + 13.0f, a + 14.0f, a + 15.0f};
return x;
EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
return detail::ploaddup8<PacketXf>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
return Packet8d{a + 0.0, a + 1.0, a + 2.0, a + 3.0, a + 4.0, a + 5.0, a + 6.0, a + 7.0};
EIGEN_STRONG_INLINE PacketXd ploaddup<PacketXd>(const double* from) {
return detail::ploaddup4<PacketXd>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16i plset<Packet16i>(const int32_t& a) {
return Packet16i{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7,
a + 8, a + 9, a + 10, a + 11, a + 12, a + 13, a + 14, a + 15};
EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const int32_t* from) {
return detail::ploaddup8<PacketXi>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8l plset<Packet8l>(const int64_t& a) {
return Packet8l{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7};
EIGEN_STRONG_INLINE PacketXl ploaddup<PacketXl>(const int64_t* from) {
return detail::ploaddup4<PacketXl>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
return detail::ploadquad8<PacketXf>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXd ploadquad<PacketXd>(const double* from) {
return detail::ploadquad4<PacketXd>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const int32_t* from) {
return detail::ploadquad8<PacketXi>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXl ploadquad<PacketXl>(const int64_t* from) {
return detail::ploadquad4<PacketXl>(from);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
template <>
EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /* unused */) {
EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
return detail::ploaddup16<PacketXf>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXd ploaddup<PacketXd>(const double* from) {
return detail::ploaddup8<PacketXd>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const int32_t* from) {
return detail::ploaddup16<PacketXi>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXl ploaddup<PacketXl>(const int64_t* from) {
return detail::ploaddup8<PacketXl>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
return detail::ploadquad16<PacketXf>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXd ploadquad<PacketXd>(const double* from) {
return detail::ploadquad8<PacketXd>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const int32_t* from) {
return detail::ploadquad16<PacketXi>(from);
}
template <>
EIGEN_STRONG_INLINE PacketXl ploadquad<PacketXl>(const int64_t* from) {
return detail::ploadquad8<PacketXl>(from);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- plset ---
// plset(a) builds the arithmetic sequence {a, a+1, a+2, ...} across all lanes;
// one branch per supported generic vector width.
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// 16 bytes: 4 floats / 2 doubles / 4 int32 / 2 int64.
template <>
EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
return PacketXf{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f};
}
template <>
EIGEN_STRONG_INLINE PacketXd plset<PacketXd>(const double& a) {
return PacketXd{a + 0.0, a + 1.0};
}
template <>
EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const int32_t& a) {
return PacketXi{a + 0, a + 1, a + 2, a + 3};
}
template <>
EIGEN_STRONG_INLINE PacketXl plset<PacketXl>(const int64_t& a) {
return PacketXl{a + 0, a + 1};
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// 32 bytes: 8 floats / 4 doubles / 8 int32 / 4 int64.
template <>
EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
return PacketXf{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f, a + 4.0f, a + 5.0f, a + 6.0f, a + 7.0f};
}
template <>
EIGEN_STRONG_INLINE PacketXd plset<PacketXd>(const double& a) {
return PacketXd{a + 0.0, a + 1.0, a + 2.0, a + 3.0};
}
template <>
EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const int32_t& a) {
return PacketXi{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7};
}
template <>
EIGEN_STRONG_INLINE PacketXl plset<PacketXl>(const int64_t& a) {
return PacketXl{a + 0, a + 1, a + 2, a + 3};
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// 64 bytes: 16 floats / 8 doubles / 16 int32 / 8 int64.
template <>
EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
return PacketXf{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f, a + 4.0f, a + 5.0f, a + 6.0f, a + 7.0f,
a + 8.0f, a + 9.0f, a + 10.0f, a + 11.0f, a + 12.0f, a + 13.0f, a + 14.0f, a + 15.0f};
}
template <>
EIGEN_STRONG_INLINE PacketXd plset<PacketXd>(const double& a) {
return PacketXd{a + 0.0, a + 1.0, a + 2.0, a + 3.0, a + 4.0, a + 5.0, a + 6.0, a + 7.0};
}
template <>
EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const int32_t& a) {
return PacketXi{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7,
a + 8, a + 9, a + 10, a + 11, a + 12, a + 13, a + 14, a + 15};
}
template <>
EIGEN_STRONG_INLINE PacketXl plset<PacketXl>(const int64_t& a) {
return PacketXl{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7};
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- peven_mask ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_STRONG_INLINE PacketXf peven_mask(const PacketXf& /* unused */) {
float kTrue = numext::bit_cast<float>(int32_t(-1));
float kFalse = 0.0f;
return Packet16f{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse,
kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse};
return PacketXf{kTrue, kFalse, kTrue, kFalse};
}
template <>
EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /* unused */) {
EIGEN_STRONG_INLINE PacketXd peven_mask(const PacketXd& /* unused */) {
double kTrue = numext::bit_cast<double>(int64_t(-1l));
double kFalse = 0.0;
return Packet8d{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse};
return PacketXd{kTrue, kFalse};
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// Alternating all-ones/all-zeros bit mask selecting even-indexed lanes
// (8-float / 4-double packets). The all-ones pattern is produced by
// bit-casting -1 of the matching integer width.
template <>
EIGEN_STRONG_INLINE PacketXf peven_mask(const PacketXf& /* unused */) {
float kTrue = numext::bit_cast<float>(int32_t(-1));
float kFalse = 0.0f;
return PacketXf{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse};
}
template <>
EIGEN_STRONG_INLINE PacketXd peven_mask(const PacketXd& /* unused */) {
double kTrue = numext::bit_cast<double>(int64_t(-1l));
double kFalse = 0.0;
return PacketXd{kTrue, kFalse, kTrue, kFalse};
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// Same even-lane mask for 16-float / 8-double packets.
template <>
EIGEN_STRONG_INLINE PacketXf peven_mask(const PacketXf& /* unused */) {
float kTrue = numext::bit_cast<float>(int32_t(-1));
float kFalse = 0.0f;
return PacketXf{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse,
kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse};
}
template <>
EIGEN_STRONG_INLINE PacketXd peven_mask(const PacketXd& /* unused */) {
double kTrue = numext::bit_cast<double>(int64_t(-1l));
double kFalse = 0.0;
return PacketXd{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse};
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// Helpers for ptranspose.
namespace detail {
template <typename Packet>
EIGEN_ALWAYS_INLINE void zip_in_place16(Packet& p1, Packet& p2) {
Packet tmp = __builtin_shufflevector(p1, p2, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
p2 = __builtin_shufflevector(p1, p2, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
EIGEN_ALWAYS_INLINE void zip_in_place2(Packet& p1, Packet& p2) {
Packet tmp = __builtin_shufflevector(p1, p2, 0, 2);
p2 = __builtin_shufflevector(p1, p2, 1, 3);
p1 = tmp;
}
// Interleaves two 4-lane packets in place:
// p1 <- {p1[0], p2[0], p1[1], p2[1]}, p2 <- {p1[2], p2[2], p1[3], p2[3]}.
template <typename Packet>
EIGEN_ALWAYS_INLINE void zip_in_place4(Packet& p1, Packet& p2) {
Packet tmp = __builtin_shufflevector(p1, p2, 0, 4, 1, 5);
p2 = __builtin_shufflevector(p1, p2, 2, 6, 3, 7);
p1 = tmp;
}
@@ -739,28 +986,68 @@ EIGEN_ALWAYS_INLINE void zip_in_place8(Packet& p1, Packet& p2) {
p1 = tmp;
}
// Interleaves two 16-lane packets in place: p1 receives lanes 0..7 of each
// input interleaved, p2 receives lanes 8..15 interleaved.
template <typename Packet>
EIGEN_ALWAYS_INLINE void zip_in_place16(Packet& p1, Packet& p2) {
Packet tmp = __builtin_shufflevector(p1, p2, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
p2 = __builtin_shufflevector(p1, p2, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
p1 = tmp;
}
template <typename Packet>
void zip_in_place(Packet& p1, Packet& p2);
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet16f>(Packet16f& p1, Packet16f& p2) {
zip_in_place16(p1, p2);
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXf>(PacketXf& p1, PacketXf& p2) {
zip_in_place4(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8d>(Packet8d& p1, Packet8d& p2) {
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXd>(PacketXd& p1, PacketXd& p2) {
zip_in_place2(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXi>(PacketXi& p1, PacketXi& p2) {
zip_in_place4(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXl>(PacketXl& p1, PacketXl& p2) {
zip_in_place2(p1, p2);
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXf>(PacketXf& p1, PacketXf& p2) {
zip_in_place8(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet16i>(Packet16i& p1, Packet16i& p2) {
zip_in_place16(p1, p2);
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXd>(PacketXd& p1, PacketXd& p2) {
zip_in_place4(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8l>(Packet8l& p1, Packet8l& p2) {
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXi>(PacketXi& p1, PacketXi& p2) {
zip_in_place8(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXl>(PacketXl& p1, PacketXl& p2) {
zip_in_place4(p1, p2);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// 64-byte vectors: dispatch to the lane-count-specific zip helper
// (16 lanes for float/int32 packets, 8 lanes for double/int64 packets).
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXf>(PacketXf& p1, PacketXf& p2) {
zip_in_place16(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXd>(PacketXd& p1, PacketXd& p2) {
zip_in_place8(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXi>(PacketXi& p1, PacketXi& p2) {
zip_in_place16(p1, p2);
}
template <>
EIGEN_ALWAYS_INLINE void zip_in_place<PacketXl>(PacketXl& p1, PacketXl& p2) {
zip_in_place8(p1, p2);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
template <typename Packet>
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
@@ -812,62 +1099,69 @@ EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
} // namespace detail
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
// ptranspose overloads: only emit valid block sizes per vector size.
// At 16 bytes: float has 4 elems, double has 2 elems.
// At 32 bytes: float has 8 elems, double has 4 elems.
// At 64 bytes: float has 16 elems, double has 8 elems.
// All sizes support PacketBlock<PacketXf, 2> and PacketBlock<PacketXf, 4>.
// In-place transpose of a 4xN float-packet block; valid at every vector size.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXf, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
// In-place transpose of a 2xN float-packet block; valid at every vector size.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXf, 2>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 8>& kernel) {
// All sizes support PacketBlock<PacketXd, 2>.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXd, 2>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
// All sizes support PacketBlock<PacketXi, 2> and PacketBlock<PacketXi, 4>.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXi, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXi, 2>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 2>& kernel) {
// All sizes support PacketBlock<PacketXl, 2>.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXl, 2>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32
// 32+ bytes: float has 8+ elems, double has 4+ elems.
// These block sizes only become valid once the packet holds enough lanes.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXf, 8>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXd, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXi, 8>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXl, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64
// 64 bytes: float has 16 elems, double has 8 elems.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXf, 16>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8d, 2>& kernel) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXd, 8>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16i, 16>& kernel) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXi, 16>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16i, 8>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16i, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16i, 2>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8l, 8>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8l, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8l, 2>& kernel) {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<PacketXl, 8>& kernel) {
detail::ptranspose_impl(kernel);
}
#endif
#endif
} // end namespace internal
} // end namespace Eigen

View File

@@ -33,10 +33,10 @@ namespace internal {
return __builtin_reduce_or(a != 0) != 0; \
}
EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet16f)
EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet8d)
EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet16i)
EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet8l)
EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXf)
EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXd)
EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXi)
EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXl)
#undef EIGEN_CLANG_PACKET_REDUX_MINMAX
#endif
@@ -52,13 +52,38 @@ EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet8l)
}
// __builtin_reduce_{mul,add} are only defined for integer types.
EIGEN_CLANG_PACKET_REDUX_INT(Packet16i)
EIGEN_CLANG_PACKET_REDUX_INT(Packet8l)
EIGEN_CLANG_PACKET_REDUX_INT(PacketXi)
EIGEN_CLANG_PACKET_REDUX_INT(PacketXl)
#undef EIGEN_CLANG_PACKET_REDUX_INT
#endif
#if EIGEN_HAS_BUILTIN(__builtin_shufflevector)
namespace detail {
// Reduction helpers for different vector sizes.
// Each returns a pair of (even-sum, odd-sum) or (even-product, odd-product).
// ReduceAdd2: 2 lanes, the split is trivially (a[0], a[1]).
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd2(
const VectorT& a) {
return {a[0], a[1]};
}
// ReduceAdd4: one halving step; t1 = {a0+a2, a1+a3}.
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd4(
const VectorT& a) {
const auto t1 = __builtin_shufflevector(a, a, 0, 1) + __builtin_shufflevector(a, a, 2, 3);
return {t1[0], t1[1]};
}
// ReduceAdd8: two halving steps down to a (even, odd) pair.
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd8(
const VectorT& a) {
const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) + __builtin_shufflevector(a, a, 4, 5, 6, 7);
const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) + __builtin_shufflevector(t1, t1, 2, 3);
return {t2[0], t2[1]};
}
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd16(
const VectorT& a) {
@@ -70,10 +95,23 @@ EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_v
}
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceAdd8(
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul2(
const VectorT& a) {
const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) + __builtin_shufflevector(a, a, 4, 5, 6, 7);
const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) + __builtin_shufflevector(t1, t1, 2, 3);
return {a[0], a[1]};
}
// ReduceMul4: multiplicative analogue of ReduceAdd4; returns
// (product of even-indexed lanes, product of odd-indexed lanes).
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul4(
const VectorT& a) {
const auto t1 = __builtin_shufflevector(a, a, 0, 1) * __builtin_shufflevector(a, a, 2, 3);
return {t1[0], t1[1]};
}
// ReduceMul8: two multiplicative halving steps down to the (even, odd) pair.
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul8(
const VectorT& a) {
const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) * __builtin_shufflevector(a, a, 4, 5, 6, 7);
const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) * __builtin_shufflevector(t1, t1, 2, 3);
return {t2[0], t2[1]};
}
@@ -86,57 +124,188 @@ EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_v
const auto t3 = __builtin_shufflevector(t2, t2, 0, 1) * __builtin_shufflevector(t2, t2, 2, 3);
return {t3[0], t3[1]};
}
// NOTE(review): this definition appears to duplicate an identical ReduceMul8
// earlier in this file (likely diff-rendering residue) — confirm that only one
// copy exists in the actual source, otherwise it is a redefinition error.
template <typename VectorT>
EIGEN_STRONG_INLINE std::pair<scalar_type_of_vector_t<VectorT>, scalar_type_of_vector_t<VectorT>> ReduceMul8(
    const VectorT& a) {
  // Fold 8 lanes -> 4 -> 2 via lane shuffles and lane-wise multiplies.
  const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) * __builtin_shufflevector(a, a, 4, 5, 6, 7);
  const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) * __builtin_shufflevector(t1, t1, 2, 3);
  return {t2[0], t2[1]};
}
} // namespace detail
// --- predux and predux_mul for float ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// Sum-reduction of PacketXf (4 floats in the 16-byte branch).
// (Removed stale pre-rename `predux<Packet16f>` signature line left by the diff.)
template <>
EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
  float even, odd;
  std::tie(even, odd) = detail::ReduceAdd4(a);
  return even + odd;
}
// Product-reduction of PacketXf (4 floats in the 16-byte branch).
template <>
EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
  const std::pair<float, float> partial = detail::ReduceMul4(a);
  return partial.first * partial.second;
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// Sum-reduction of PacketXf (8 floats in the 32-byte branch).
template <>
EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
  const std::pair<float, float> sums = detail::ReduceAdd8(a);
  return sums.first + sums.second;
}
// Product-reduction of PacketXf (8 floats in the 32-byte branch).
template <>
EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
  const std::pair<float, float> prods = detail::ReduceMul8(a);
  return prods.first * prods.second;
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// Sum-reduction of PacketXf (16 floats in the 64-byte branch).
template <>
EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
  const std::pair<float, float> sums = detail::ReduceAdd16(a);
  return sums.first + sums.second;
}
// Product-reduction of PacketXf (16 floats in the 64-byte branch).
// (Removed stale pre-rename `predux<Packet8d>` signature line left by the diff.)
template <>
EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
  float even, odd;
  std::tie(even, odd) = detail::ReduceMul16(a);
  return even * odd;
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- predux and predux_mul for double ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// Sum-reduction of PacketXd (2 doubles in the 16-byte branch).
template <>
EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
  const std::pair<double, double> sums = detail::ReduceAdd2(a);
  return sums.first + sums.second;
}
// Product-reduction of PacketXd (2 doubles in the 16-byte branch).
template <>
EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
  const std::pair<double, double> prods = detail::ReduceMul2(a);
  return prods.first * prods.second;
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// Sum-reduction of PacketXd (4 doubles in the 32-byte branch).
template <>
EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
  const std::pair<double, double> sums = detail::ReduceAdd4(a);
  return sums.first + sums.second;
}
// Product-reduction of PacketXd (4 doubles in the 32-byte branch).
template <>
EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
  const std::pair<double, double> prods = detail::ReduceMul4(a);
  return prods.first * prods.second;
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// Sum-reduction of PacketXd (8 doubles in the 64-byte branch).
template <>
EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
  const std::pair<double, double> sums = detail::ReduceAdd8(a);
  return sums.first + sums.second;
}
// Product-reduction of PacketXd (8 doubles in the 64-byte branch).
// (Removed diff residue: a stray duplicate `predux_mul<Packet16f>` specialization
// — which would redefine the float specialization above — and the stale
// pre-rename `Packet8d` signature line.)
template <>
EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
  double even, odd;
  std::tie(even, odd) = detail::ReduceMul8(a);
  return even * odd;
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- predux for complex<float> ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// Sum-reduction of PacketXcf (2 complex floats in the 16-byte branch):
// even/odd lane sums of the underlying real vector are the real/imag parts.
// (Removed stale pre-rename `Packet8cf` signature line left by the diff.)
template <>
EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
  float re, im;
  std::tie(re, im) = detail::ReduceAdd4(a.v);
  return std::complex<float>(re, im);
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// Sum-reduction of PacketXcf (4 complex floats in the 32-byte branch).
template <>
EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
  const std::pair<float, float> sums = detail::ReduceAdd8(a.v);
  return std::complex<float>(sums.first, sums.second);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// Sum-reduction of PacketXcf (8 complex floats in the 64-byte branch).
template <>
EIGEN_STRONG_INLINE std::complex<float> predux<PacketXcf>(const PacketXcf& a) {
  // Interleaved (re, im) lane layout: the even/odd partial sums of the real
  // vector are the summed real and imaginary parts.
  float re, im;
  std::tie(re, im) = detail::ReduceAdd16(a.v);
  return std::complex<float>(re, im);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- predux for complex<double> ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// Sum-reduction of PacketXcd in the 16-byte branch.
// (Removed stale pre-rename `Packet4cd` signature line left by the diff.)
template <>
EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
  // 1 complex double: just return it
  return a[0];
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
template <>
EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
double re, im;
std::tie(re, im) = detail::ReduceAdd4(a.v);
return std::complex<double>(re, im);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// Sum-reduction of PacketXcd (4 complex doubles in the 64-byte branch).
template <>
EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
  // Even/odd lane sums of the real vector give the real/imag parts.
  double re, im;
  std::tie(re, im) = detail::ReduceAdd8(a.v);
  return std::complex<double>(re, im);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- predux_mul for complex<float> ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// Product-reduction of PacketXcf in the 16-byte branch.
// (Removed stale pre-rename `Packet8cf` signature line left by the diff.)
template <>
EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
  // 2 complex floats: just multiply them
  return a[0] * a[1];
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// Product-reduction of PacketXcf (4 complex floats in the 32-byte branch).
template <>
EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
  // 4 complex floats: split into 2+2, multiply, then scalar multiply
  const Packet2cf lower2 = Packet2cf(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3));
  const Packet2cf upper2 = Packet2cf(__builtin_shufflevector(a.v, a.v, 4, 5, 6, 7));
  // pmul on a complex packet performs lane-wise complex multiplication.
  const Packet2cf prod2 = pmul<Packet2cf>(lower2, upper2);
  return prod2[0] * prod2[1];
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// Product-reduction of PacketXcf (8 complex floats in the 64-byte branch).
// (The middle of this body was clobbered by a diff hunk header; the 4->2 fold
// is reconstructed here following the pattern of the 32-byte branch.)
template <>
EIGEN_STRONG_INLINE std::complex<float> predux_mul<PacketXcf>(const PacketXcf& a) {
  // 8 complex floats: 8->4->2->scalar
  const Packet4cf lower4 = Packet4cf(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3, 4, 5, 6, 7));
  const Packet4cf upper4 = Packet4cf(__builtin_shufflevector(a.v, a.v, 8, 9, 10, 11, 12, 13, 14, 15));
  const Packet4cf prod4 = pmul<Packet4cf>(lower4, upper4);
  const Packet2cf lower2 = Packet2cf(__builtin_shufflevector(prod4.v, prod4.v, 0, 1, 2, 3));
  const Packet2cf upper2 = Packet2cf(__builtin_shufflevector(prod4.v, prod4.v, 4, 5, 6, 7));
  const Packet2cf prod2 = pmul<Packet2cf>(lower2, upper2);
  return prod2[0] * prod2[1];
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
// --- predux_mul for complex<double> ---
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// Product-reduction of PacketXcd in the 16-byte branch.
// (Removed stale pre-rename `Packet4cd` signature line left by the diff.)
template <>
EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
  // 1 complex double: just return it
  return a[0];
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// Product-reduction of PacketXcd (2 complex doubles in the 32-byte branch):
// a single scalar complex multiply suffices.
template <>
EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
  return a[0] * a[1];
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// Product-reduction of PacketXcd (4 complex doubles in the 64-byte branch).
template <>
EIGEN_STRONG_INLINE std::complex<double> predux_mul<PacketXcd>(const PacketXcd& a) {
  // 4 complex doubles: split into 2+2, multiply, then scalar multiply
  const Packet2cd lower2 = Packet2cd(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3));
  const Packet2cd upper2 = Packet2cd(__builtin_shufflevector(a.v, a.v, 4, 5, 6, 7));
  // pmul on a complex packet performs lane-wise complex multiplication.
  const Packet2cd prod2 = pmul<Packet2cd>(lower2, upper2);
  return prod2[0] * prod2[1];
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
#endif
} // end namespace internal

View File

@@ -20,56 +20,140 @@ namespace internal {
// preinterpret
//==============================================================================
// Bit-preserving reinterpretation of an integer packet as a float packet.
// (Removed doubled pre-rename Packet16f signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXf preinterpret<PacketXf, PacketXi>(const PacketXi& a) {
  return reinterpret_cast<PacketXf>(a);
}
// Bit-preserving reinterpretation of a float packet as an integer packet.
// (Removed doubled pre-rename Packet16i signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXi preinterpret<PacketXi, PacketXf>(const PacketXf& a) {
  return reinterpret_cast<PacketXi>(a);
}
// Bit-preserving reinterpretation of an int64 packet as a double packet.
// (Removed doubled pre-rename Packet8d signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXd preinterpret<PacketXd, PacketXl>(const PacketXl& a) {
  return reinterpret_cast<PacketXd>(a);
}
// Bit-preserving reinterpretation of a double packet as an int64 packet.
// (Removed doubled pre-rename Packet8l signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXl preinterpret<PacketXl, PacketXd>(const PacketXd& a) {
  return reinterpret_cast<PacketXl>(a);
}
//==============================================================================
// pcast
//==============================================================================
#if EIGEN_HAS_BUILTIN(__builtin_convertvector)
// Float-to-int conversions: __builtin_convertvector has UB for NaN/inf/
// out-of-range inputs. Replace NaN with 0 before converting so that
// pldexp_fast (which may pass NaN exponents) doesn't trigger UB.
// float -> int32 cast, lane-wise.
// (Removed doubled pre-rename Packet16i signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXi pcast<PacketXf, PacketXi>(const PacketXf& a) {
  // Replace NaN lanes with 0: __builtin_convertvector has UB for NaN inputs.
  const PacketXf safe = a == a ? a : PacketXf(0);
  return __builtin_convertvector(safe, PacketXi);
}
// int32 -> float cast, lane-wise (always well-defined, no NaN guard needed).
// (Removed doubled pre-rename Packet16f signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXf pcast<PacketXi, PacketXf>(const PacketXi& a) {
  return __builtin_convertvector(a, PacketXf);
}
// double -> int64 cast, lane-wise.
// (Removed doubled pre-rename Packet8l signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXl pcast<PacketXd, PacketXl>(const PacketXd& a) {
  // Replace NaN lanes with 0: __builtin_convertvector has UB for NaN inputs.
  const PacketXd safe = a == a ? a : PacketXd(0);
  return __builtin_convertvector(safe, PacketXl);
}
// int64 -> double cast, lane-wise (always well-defined, no NaN guard needed).
// (Removed doubled pre-rename Packet8d signature/return lines left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXd pcast<PacketXl, PacketXd>(const PacketXl& a) {
  return __builtin_convertvector(a, PacketXd);
}
// float -> double: converts lower half of floats to doubles
// double -> float: converts two PacketXd to one PacketXf
// int32 -> int64: converts lower half of int32s to int64s
// int64 -> int32: converts two PacketXl to one PacketXi
#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16
// float -> double: widens the low 2 float lanes to 2 doubles.
template <>
EIGEN_STRONG_INLINE PacketXd pcast<PacketXf, PacketXd>(const PacketXf& a) {
  using HalfF = detail::VectorType<float, 2>;
  const HalfF low = __builtin_shufflevector(a, a, 0, 1);
  return __builtin_convertvector(low, PacketXd);
}
// double -> float: narrows two 2-double packets into one 4-float packet.
template <>
EIGEN_STRONG_INLINE PacketXf pcast<PacketXd, PacketXf>(const PacketXd& a, const PacketXd& b) {
  using HalfF = detail::VectorType<float, 2>;
  const HalfF low = __builtin_convertvector(a, HalfF);
  const HalfF high = __builtin_convertvector(b, HalfF);
  return __builtin_shufflevector(low, high, 0, 1, 2, 3);
}
// int32 -> int64: widens the low 2 int32 lanes to 2 int64s.
template <>
EIGEN_STRONG_INLINE PacketXl pcast<PacketXi, PacketXl>(const PacketXi& a) {
  using HalfI = detail::VectorType<int32_t, 2>;
  const HalfI low = __builtin_shufflevector(a, a, 0, 1);
  return __builtin_convertvector(low, PacketXl);
}
// int64 -> int32: narrows two 2-int64 packets into one 4-int32 packet.
template <>
EIGEN_STRONG_INLINE PacketXi pcast<PacketXl, PacketXi>(const PacketXl& a, const PacketXl& b) {
  using HalfI = detail::VectorType<int32_t, 2>;
  const HalfI low = __builtin_convertvector(a, HalfI);
  const HalfI high = __builtin_convertvector(b, HalfI);
  return __builtin_shufflevector(low, high, 0, 1, 2, 3);
}
#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32
// float -> double: widens the low 4 float lanes to 4 doubles.
template <>
EIGEN_STRONG_INLINE PacketXd pcast<PacketXf, PacketXd>(const PacketXf& a) {
  using HalfF = detail::VectorType<float, 4>;
  const HalfF low = __builtin_shufflevector(a, a, 0, 1, 2, 3);
  return __builtin_convertvector(low, PacketXd);
}
// double -> float: narrows two 4-double packets into one 8-float packet.
template <>
EIGEN_STRONG_INLINE PacketXf pcast<PacketXd, PacketXf>(const PacketXd& a, const PacketXd& b) {
  using HalfF = detail::VectorType<float, 4>;
  const HalfF low = __builtin_convertvector(a, HalfF);
  const HalfF high = __builtin_convertvector(b, HalfF);
  return __builtin_shufflevector(low, high, 0, 1, 2, 3, 4, 5, 6, 7);
}
// int32 -> int64: widens the low 4 int32 lanes to 4 int64s.
template <>
EIGEN_STRONG_INLINE PacketXl pcast<PacketXi, PacketXl>(const PacketXi& a) {
  using HalfI = detail::VectorType<int32_t, 4>;
  const HalfI low = __builtin_shufflevector(a, a, 0, 1, 2, 3);
  return __builtin_convertvector(low, PacketXl);
}
// int64 -> int32: narrows two 4-int64 packets into one 8-int32 packet.
template <>
EIGEN_STRONG_INLINE PacketXi pcast<PacketXl, PacketXi>(const PacketXl& a, const PacketXl& b) {
  using HalfI = detail::VectorType<int32_t, 4>;
  const HalfI low = __builtin_convertvector(a, HalfI);
  const HalfI high = __builtin_convertvector(b, HalfI);
  return __builtin_shufflevector(low, high, 0, 1, 2, 3, 4, 5, 6, 7);
}
#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64
// float -> double: converts lower 8 floats to 8 doubles
// (Removed stale pre-rename Packet8d/Packet16f signature and return lines
// left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXd pcast<PacketXf, PacketXd>(const PacketXf& a) {
  using HalfFloat = detail::VectorType<float, 8>;
  HalfFloat lo = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7);
  return __builtin_convertvector(lo, PacketXd);
}
// double -> float: converts two PacketXd (8 doubles each) to one PacketXf (16 floats)
// (This body was truncated by a diff hunk header; the final 16-lane shuffle is
// reconstructed following the pattern of the 16/32-byte branches.)
template <>
EIGEN_STRONG_INLINE PacketXf pcast<PacketXd, PacketXf>(const PacketXd& a, const PacketXd& b) {
  using HalfFloat = detail::VectorType<float, 8>;
  HalfFloat lo = __builtin_convertvector(a, HalfFloat);
  HalfFloat hi = __builtin_convertvector(b, HalfFloat);
  return __builtin_shufflevector(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}
// int32 -> int64: converts lower 8 int32s to 8 int64s
// (Removed stale pre-rename Packet8l/Packet16i signature and return lines
// left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXl pcast<PacketXi, PacketXl>(const PacketXi& a) {
  using HalfInt = detail::VectorType<int32_t, 8>;
  HalfInt lo = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7);
  return __builtin_convertvector(lo, PacketXl);
}
// int64 -> int32: converts two PacketXl (8 int64s each) to one PacketXi (16 int32s)
// (Removed stale pre-rename Packet16i/Packet8l signature line left by the diff.)
template <>
EIGEN_STRONG_INLINE PacketXi pcast<PacketXl, PacketXi>(const PacketXl& a, const PacketXl& b) {
  using HalfInt = detail::VectorType<int32_t, 8>;
  HalfInt lo = __builtin_convertvector(a, HalfInt);
  HalfInt hi = __builtin_convertvector(b, HalfInt);
  return __builtin_shufflevector(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}
#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES
#endif
} // end namespace internal

View File

@@ -186,6 +186,17 @@ ei_add_test(mixingtypes)
ei_add_test(float_conversion)
ei_add_test(io)
ei_add_test(packetmath "-DEIGEN_FAST_MATH=1")
# Generic clang vector backend tests for different vector sizes.
# Probe for clang-style ext_vector_type / __builtin_vectorelements support
# before registering the size-specific test targets, so non-clang compilers
# simply skip these tests instead of failing to build them.
include(CheckCXXSourceCompiles)
check_cxx_source_compiles("
typedef float v4sf __attribute__((ext_vector_type(4)));
int main() { return __builtin_vectorelements(v4sf{}); }
" COMPILER_SUPPORTS_VECTOR_EXTENSIONS)
if(COMPILER_SUPPORTS_VECTOR_EXTENSIONS)
# Each target recompiles packetmath.cpp with a different forced vector width.
ei_add_test(packetmath_generic_16 "-DEIGEN_FAST_MATH=1")
ei_add_test(packetmath_generic_32 "-DEIGEN_FAST_MATH=1")
ei_add_test(packetmath_generic_64 "-DEIGEN_FAST_MATH=1")
endif()
ei_add_test(packet_segment)
ei_add_test(vectorization_logic)
ei_add_test(basicstuff)

View File

@@ -0,0 +1,4 @@
// Force the generic clang vector backend with 16-byte vectors.
// Runs the full packetmath test suite under this configuration.
#define EIGEN_VECTORIZE_GENERIC 1
#define EIGEN_GENERIC_VECTOR_SIZE_BYTES 16
#include "packetmath.cpp"

View File

@@ -0,0 +1,4 @@
// Force the generic clang vector backend with 32-byte vectors.
// Runs the full packetmath test suite under this configuration.
#define EIGEN_VECTORIZE_GENERIC 1
#define EIGEN_GENERIC_VECTOR_SIZE_BYTES 32
#include "packetmath.cpp"

View File

@@ -0,0 +1,4 @@
// Force the generic clang vector backend with 64-byte vectors.
// Runs the full packetmath test suite under this configuration.
#define EIGEN_VECTORIZE_GENERIC 1
#define EIGEN_GENERIC_VECTOR_SIZE_BYTES 64
#include "packetmath.cpp"