diff --git a/Eigen/src/Core/arch/clang/Complex.h b/Eigen/src/Core/arch/clang/Complex.h index 6b8e7768e..cfcc229bb 100644 --- a/Eigen/src/Core/arch/clang/Complex.h +++ b/Eigen/src/Core/arch/clang/Complex.h @@ -27,11 +27,21 @@ struct complex_packet_wrapper { RealPacketT v; }; -using Packet8cf = complex_packet_wrapper; -using Packet4cf = complex_packet_wrapper; +// --- Primary complex packet aliases --- +constexpr int kComplexFloatSize = kFloatPacketSize / 2; // 2, 4, or 8 +constexpr int kComplexDoubleSize = kDoublePacketSize / 2; // 1, 2, or 4 +using PacketXcf = complex_packet_wrapper; +using PacketXcd = complex_packet_wrapper; + +// Sub-packet types needed for reductions at larger sizes. +// When PacketXcf IS already a given size, we skip the alias to avoid duplicates. +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32 using Packet2cf = complex_packet_wrapper; -using Packet4cd = complex_packet_wrapper; +#endif +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64 +using Packet4cf = complex_packet_wrapper; using Packet2cd = complex_packet_wrapper; +#endif struct generic_complex_packet_traits : default_packet_traits { enum { @@ -58,39 +68,39 @@ struct generic_complex_packet_traits : default_packet_traits { template <> struct packet_traits> : generic_complex_packet_traits { - using type = Packet8cf; - using half = Packet8cf; + using type = PacketXcf; + using half = PacketXcf; enum { - size = 8, + size = kComplexFloatSize, }; }; template <> -struct unpacket_traits : generic_unpacket_traits { +struct unpacket_traits : generic_unpacket_traits { using type = std::complex; - using half = Packet8cf; - using as_real = Packet16f; + using half = PacketXcf; + using as_real = PacketXf; enum { - size = 8, + size = kComplexFloatSize, }; }; template <> struct packet_traits> : generic_complex_packet_traits { - using type = Packet4cd; - using half = Packet4cd; + using type = PacketXcd; + using half = PacketXcd; enum { - size = 4, + size = kComplexDoubleSize, }; }; template <> -struct unpacket_traits : generic_unpacket_traits { +struct unpacket_traits : generic_unpacket_traits { using type = std::complex; - using half = Packet4cd; - using as_real = Packet8d; + using half = PacketXcd; + using as_real = PacketXd; enum { - size = 4, + size = kComplexDoubleSize, }; }; @@ -115,24 +125,58 @@ struct unpacket_traits : generic_unpacket_traits { pstore(&numext::real_ref(*to), from.v); \ } -EIGEN_CLANG_COMPLEX_LOAD_STORE(Packet8cf); -EIGEN_CLANG_COMPLEX_LOAD_STORE(Packet4cd); +EIGEN_CLANG_COMPLEX_LOAD_STORE(PacketXcf); +EIGEN_CLANG_COMPLEX_LOAD_STORE(PacketXcd); #undef EIGEN_CLANG_COMPLEX_LOAD_STORE -template <> -EIGEN_STRONG_INLINE Packet8cf pset1(const std::complex& from) { - const float re = numext::real(from); - const float im = numext::imag(from); - return Packet8cf(Packet16f{re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im}); -} +// --- pset1 for complex --- +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 template <> -EIGEN_STRONG_INLINE Packet4cd pset1(const std::complex& from) { +EIGEN_STRONG_INLINE PacketXcf pset1(const std::complex& from) { + const float re = numext::real(from); + const float im = numext::imag(from); + return PacketXcf(PacketXf{re, im, re, im}); +} +template <> +EIGEN_STRONG_INLINE PacketXcd pset1(const std::complex& from) { const double re = numext::real(from); const double im = numext::imag(from); - return Packet4cd(Packet8d{re, im, re, im, re, im, re, im}); + return PacketXcd(PacketXd{re, im}); } +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE PacketXcf pset1(const 
std::complex& from) { + const float re = numext::real(from); + const float im = numext::imag(from); + return PacketXcf(PacketXf{re, im, re, im, re, im, re, im}); +} +template <> +EIGEN_STRONG_INLINE PacketXcd pset1(const std::complex& from) { + const double re = numext::real(from); + const double im = numext::imag(from); + return PacketXcd(PacketXd{re, im, re, im}); +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE PacketXcf pset1(const std::complex& from) { + const float re = numext::real(from); + const float im = numext::imag(from); + return PacketXcf(PacketXf{re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im}); +} +template <> +EIGEN_STRONG_INLINE PacketXcd pset1(const std::complex& from) { + const double re = numext::real(from); + const double im = numext::imag(from); + return PacketXcd(PacketXd{re, im, re, im, re, im, re, im}); +} + +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + // ----------- Unary ops ------------------ #define DELEGATE_UNARY_TO_REAL_OP(PACKET_TYPE, OP) \ template <> \ @@ -149,134 +193,348 @@ EIGEN_STRONG_INLINE Packet4cd pset1(const std::complex& from) } \ EIGEN_INSTANTIATE_COMPLEX_MATH_FUNCS(PACKET_TYPE) -EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(Packet8cf); -EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(Packet4cd); +EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(PacketXcf); +EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS(PacketXcd); + +// --- pconj --- +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 template <> -EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a) { - return Packet8cf(__builtin_shufflevector(a.v, -a.v, 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31)); +EIGEN_STRONG_INLINE PacketXcf pconj(const PacketXcf& a) { + return PacketXcf(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7)); } template <> +EIGEN_STRONG_INLINE PacketXcd pconj(const PacketXcd& a) { + return PacketXcd(__builtin_shufflevector(a.v, -a.v, 0, 3)); +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE PacketXcf pconj(const PacketXcf& a) { + return PacketXcf(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15)); +} +template <> +EIGEN_STRONG_INLINE PacketXcd pconj(const PacketXcd& a) { + return PacketXcd(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7)); +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE PacketXcf pconj(const PacketXcf& a) { + return PacketXcf(__builtin_shufflevector(a.v, -a.v, 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31)); +} +template <> +EIGEN_STRONG_INLINE PacketXcd pconj(const PacketXcd& a) { + return PacketXcd(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15)); +} + +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + +// Sub-packet pconj specializations needed for reductions. 
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32 +template <> +EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { + return Packet2cf(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7)); +} +#endif +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64 +template <> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a) { return Packet4cf(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15)); } template <> -EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { - return Packet2cf(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7)); -} - -template <> -EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a) { - return Packet4cd(__builtin_shufflevector(a.v, -a.v, 0, 9, 2, 11, 4, 13, 6, 15)); -} -template <> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) { return Packet2cd(__builtin_shufflevector(a.v, -a.v, 0, 5, 2, 7)); } +#endif #undef DELEGATE_UNARY_TO_REAL_OP #undef EIGEN_CLANG_COMPLEX_UNARY_CWISE_OPS // Flip real and imaginary parts, i.e. {re(a), im(a)} -> {im(a), re(a)}. +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 + template <> -EIGEN_STRONG_INLINE Packet8cf pcplxflip(const Packet8cf& a) { - return Packet8cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14)); +EIGEN_STRONG_INLINE PacketXcf pcplxflip(const PacketXcf& a) { + return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2)); } template <> +EIGEN_STRONG_INLINE PacketXcd pcplxflip(const PacketXcd& a) { + return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 0)); +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE PacketXcf pcplxflip(const PacketXcf& a) { + return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6)); +} +template <> +EIGEN_STRONG_INLINE PacketXcd pcplxflip(const PacketXcd& a) { + return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2)); +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE PacketXcf pcplxflip(const PacketXcf& a) { + return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14)); +} +template <> +EIGEN_STRONG_INLINE PacketXcd pcplxflip(const PacketXcd& a) { + return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6)); +} + +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + +// Sub-packet pcplxflip specializations needed for reductions. +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32 +template <> +EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& a) { + return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2)); +} +#endif +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64 +template <> EIGEN_STRONG_INLINE Packet4cf pcplxflip(const Packet4cf& a) { return Packet4cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6)); } template <> -EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& a) { - return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2)); -} -template <> -EIGEN_STRONG_INLINE Packet4cd pcplxflip(const Packet4cd& a) { - return Packet4cd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2, 5, 4, 7, 6)); -} -template <> EIGEN_STRONG_INLINE Packet2cd pcplxflip(const Packet2cd& a) { return Packet2cd(__builtin_shufflevector(a.v, a.v, 1, 0, 3, 2)); } +#endif // Copy real to imaginary part, i.e. {re(a), im(a)} -> {re(a), re(a)}. 
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 + template <> -EIGEN_STRONG_INLINE Packet8cf pdupreal(const Packet8cf& a) { - return Packet8cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14)); +EIGEN_STRONG_INLINE PacketXcf pdupreal(const PacketXcf& a) { + return PacketXcf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2)); } template <> +EIGEN_STRONG_INLINE PacketXcd pdupreal(const PacketXcd& a) { + return PacketXcd(__builtin_shufflevector(a.v, a.v, 0, 0)); +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE PacketXcf pdupreal(const PacketXcf& a) { + return PacketXcf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6)); +} +template <> +EIGEN_STRONG_INLINE PacketXcd pdupreal(const PacketXcd& a) { + return PacketXcd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2)); +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE PacketXcf pdupreal(const PacketXcf& a) { + return PacketXcf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14)); +} +template <> +EIGEN_STRONG_INLINE PacketXcd pdupreal(const PacketXcd& a) { + return PacketXcd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6)); +} + +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + +// Sub-packet pdupreal specializations needed for reductions. +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32 +template <> +EIGEN_STRONG_INLINE Packet2cf pdupreal(const Packet2cf& a) { + return Packet2cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2)); +} +#endif +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64 +template <> EIGEN_STRONG_INLINE Packet4cf pdupreal(const Packet4cf& a) { return Packet4cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6)); } template <> -EIGEN_STRONG_INLINE Packet2cf pdupreal(const Packet2cf& a) { - return Packet2cf(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2)); -} -template <> -EIGEN_STRONG_INLINE Packet4cd pdupreal(const Packet4cd& a) { - return Packet4cd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2, 4, 4, 6, 6)); -} -template <> EIGEN_STRONG_INLINE Packet2cd pdupreal(const Packet2cd& a) { return Packet2cd(__builtin_shufflevector(a.v, a.v, 0, 0, 2, 2)); } +#endif // Copy imaginary to real part, i.e. {re(a), im(a)} -> {im(a), im(a)}. 
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 + template <> -EIGEN_STRONG_INLINE Packet8cf pdupimag(const Packet8cf& a) { - return Packet8cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15)); +EIGEN_STRONG_INLINE PacketXcf pdupimag(const PacketXcf& a) { + return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3)); } template <> +EIGEN_STRONG_INLINE PacketXcd pdupimag(const PacketXcd& a) { + return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 1)); +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE PacketXcf pdupimag(const PacketXcf& a) { + return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7)); +} +template <> +EIGEN_STRONG_INLINE PacketXcd pdupimag(const PacketXcd& a) { + return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3)); +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE PacketXcf pdupimag(const PacketXcf& a) { + return PacketXcf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15)); +} +template <> +EIGEN_STRONG_INLINE PacketXcd pdupimag(const PacketXcd& a) { + return PacketXcd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7)); +} + +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + +// Sub-packet pdupimag specializations needed for reductions. +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32 +template <> +EIGEN_STRONG_INLINE Packet2cf pdupimag(const Packet2cf& a) { + return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3)); +} +#endif +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64 +template <> EIGEN_STRONG_INLINE Packet4cf pdupimag(const Packet4cf& a) { return Packet4cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7)); } template <> -EIGEN_STRONG_INLINE Packet2cf pdupimag(const Packet2cf& a) { - return Packet2cf(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3)); -} -template <> -EIGEN_STRONG_INLINE Packet4cd pdupimag(const Packet4cd& a) { - return Packet4cd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3, 5, 5, 7, 7)); -} -template <> EIGEN_STRONG_INLINE Packet2cd pdupimag(const Packet2cd& a) { return Packet2cd(__builtin_shufflevector(a.v, a.v, 1, 1, 3, 3)); } +#endif + +// --- ploaddup --- +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 template <> -EIGEN_STRONG_INLINE Packet8cf ploaddup(const std::complex* from) { - return Packet8cf(Packet16f{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]), - std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]), - std::real(from[2]), std::imag(from[2]), std::real(from[2]), std::imag(from[2]), - std::real(from[3]), std::imag(from[3]), std::real(from[3]), std::imag(from[3])}); +EIGEN_STRONG_INLINE PacketXcf ploaddup(const std::complex* from) { + return pset1(*from); } template <> -EIGEN_STRONG_INLINE Packet4cd ploaddup(const std::complex* from) { - return Packet4cd(Packet8d{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]), +EIGEN_STRONG_INLINE PacketXcd ploaddup(const std::complex* from) { + return pset1(*from); +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE PacketXcf ploaddup(const std::complex* from) { + return PacketXcf(PacketXf{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]), + std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])}); +} +template <> +EIGEN_STRONG_INLINE PacketXcd ploaddup(const std::complex* from) { + return pset1(*from); +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 
64 + +template <> +EIGEN_STRONG_INLINE PacketXcf ploaddup(const std::complex* from) { + return PacketXcf(PacketXf{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]), + std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]), + std::real(from[2]), std::imag(from[2]), std::real(from[2]), std::imag(from[2]), + std::real(from[3]), std::imag(from[3]), std::real(from[3]), std::imag(from[3])}); +} +template <> +EIGEN_STRONG_INLINE PacketXcd ploaddup(const std::complex* from) { + return PacketXcd(PacketXd{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]), std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])}); } -template <> -EIGEN_STRONG_INLINE Packet8cf ploadquad(const std::complex* from) { - return Packet8cf(Packet16f{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]), - std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]), - std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]), - std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])}); -} -template <> -EIGEN_STRONG_INLINE Packet4cd ploadquad(const std::complex* from) { - return pset1(*from); -} +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + +// --- ploadquad --- +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 template <> -EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) { - return Packet8cf(reinterpret_cast(preverse(reinterpret_cast(a.v)))); +EIGEN_STRONG_INLINE PacketXcf ploadquad(const std::complex* from) { + return pset1(*from); } template <> -EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) { - return Packet4cd(__builtin_shufflevector(a.v, a.v, 6, 7, 4, 5, 2, 3, 0, 1)); +EIGEN_STRONG_INLINE PacketXcd ploadquad(const std::complex* from) { + return pset1(*from); } +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE PacketXcf ploadquad(const std::complex* from) { + return pset1(*from); +} +template <> +EIGEN_STRONG_INLINE PacketXcd ploadquad(const std::complex* from) { + return pset1(*from); +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE PacketXcf ploadquad(const std::complex* from) { + return PacketXcf(PacketXf{std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]), + std::real(from[0]), std::imag(from[0]), std::real(from[0]), std::imag(from[0]), + std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1]), + std::real(from[1]), std::imag(from[1]), std::real(from[1]), std::imag(from[1])}); +} +template <> +EIGEN_STRONG_INLINE PacketXcd ploadquad(const std::complex* from) { + return pset1(*from); +} + +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + +// --- preverse --- +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 + +template <> +EIGEN_STRONG_INLINE PacketXcf preverse(const PacketXcf& a) { + // 2 complex floats: swap pairs (0,1) and (2,3) + return PacketXcf(__builtin_shufflevector(a.v, a.v, 2, 3, 0, 1)); +} +template <> +EIGEN_STRONG_INLINE PacketXcd preverse(const PacketXcd& a) { + // 1 complex double: identity + return a; +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE PacketXcf preverse(const PacketXcf& a) { + // 4 complex floats: reverse pairs + return PacketXcf(reinterpret_cast(preverse(reinterpret_cast(a.v)))); +} +template <> +EIGEN_STRONG_INLINE PacketXcd preverse(const PacketXcd& a) { + // 2 complex doubles: swap pairs + return PacketXcd(__builtin_shufflevector(a.v, 
a.v, 2, 3, 0, 1)); +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE PacketXcf preverse(const PacketXcf& a) { + return PacketXcf(reinterpret_cast(preverse(reinterpret_cast(a.v)))); +} +template <> +EIGEN_STRONG_INLINE PacketXcd preverse(const PacketXcd& a) { + return PacketXcd(__builtin_shufflevector(a.v, a.v, 6, 7, 4, 5, 2, 3, 0, 1)); +} + +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + // ----------- Binary ops ------------------ #define DELEGATE_BINARY_TO_REAL_OP(PACKET_TYPE, OP) \ template <> \ @@ -300,8 +558,8 @@ EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) { return PACKET_TYPE(pand(pdupreal(t).v, pdupimag(t).v)); \ } -EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(Packet8cf); -EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(Packet4cd); +EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(PacketXcf); +EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(PacketXcd); // Binary ops that are needed on sub-packets for predux and predux_mul. #define EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(PACKET_TYPE) \ @@ -311,11 +569,17 @@ EIGEN_CLANG_COMPLEX_BINARY_CWISE_OPS(Packet4cd); return pmul_complex(a, b); \ } -EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet8cf); -EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet4cf); +EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(PacketXcf); +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32 EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cf); -EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet4cd); +#endif +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64 +EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet4cf); +#endif +EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(PacketXcd); +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64 EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cd); +#endif #define EIGEN_CLANG_PACKET_SCATTER_GATHER(PACKET_TYPE) \ template <> \ @@ -338,8 +602,8 @@ EIGEN_CLANG_COMPLEX_REDUCER_BINARY_CWISE_OPS(Packet2cd); return result; \ } -EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8cf); -EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet4cd); +EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXcf); +EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXcd); #undef EIGEN_CLANG_PACKET_SCATTER_GATHER #undef DELEGATE_BINARY_TO_REAL_OP @@ -348,46 +612,89 @@ EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet4cd); // ------------ ternary ops ------------- template <> -EIGEN_STRONG_INLINE Packet8cf pselect(const Packet8cf& mask, const Packet8cf& a, const Packet8cf& b) { - return Packet8cf(reinterpret_cast( - pselect(reinterpret_cast(mask.v), reinterpret_cast(a.v), reinterpret_cast(b.v)))); +EIGEN_STRONG_INLINE PacketXcf pselect(const PacketXcf& mask, const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(reinterpret_cast( + pselect(reinterpret_cast(mask.v), reinterpret_cast(a.v), reinterpret_cast(b.v)))); } +// --- zip_in_place for complex --- namespace detail { + +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 + template <> -EIGEN_ALWAYS_INLINE void zip_in_place(Packet8cf& p1, Packet8cf& p2) { - Packet16f tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23); +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXcf& p1, PacketXcf& p2) { + PacketXf tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 4, 5); + p2.v = __builtin_shufflevector(p1.v, p2.v, 2, 3, 6, 7); + p1.v = tmp; +} +// PacketXcd at 16 bytes has 1 element, no zip_in_place needed. 
+ +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXcf& p1, PacketXcf& p2) { + PacketXf tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 8, 9, 2, 3, 10, 11); + p2.v = __builtin_shufflevector(p1.v, p2.v, 4, 5, 12, 13, 6, 7, 14, 15); + p1.v = tmp; +} +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXcd& p1, PacketXcd& p2) { + PacketXd tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 4, 5); + p2.v = __builtin_shufflevector(p1.v, p2.v, 2, 3, 6, 7); + p1.v = tmp; +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXcf& p1, PacketXcf& p2) { + PacketXf tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23); p2.v = __builtin_shufflevector(p1.v, p2.v, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31); p1.v = tmp; } template <> -EIGEN_ALWAYS_INLINE void zip_in_place(Packet4cd& p1, Packet4cd& p2) { - Packet8d tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 8, 9, 2, 3, 10, 11); +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXcd& p1, PacketXcd& p2) { + PacketXd tmp = __builtin_shufflevector(p1.v, p2.v, 0, 1, 8, 9, 2, 3, 10, 11); p2.v = __builtin_shufflevector(p1.v, p2.v, 4, 5, 12, 13, 6, 7, 14, 15); p1.v = tmp; } + +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + } // namespace detail -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +// --- ptranspose for complex --- +// PacketXcf: valid block sizes depend on kComplexFloatSize. +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32 +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +#endif +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64 +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { detail::ptranspose_impl(kernel); } +#endif -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +// PacketXcd: valid block sizes depend on kComplexDoubleSize. 
+#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32 +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +#endif +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64 +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { detail::ptranspose_impl(kernel); } +#endif -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf, Packet16f) -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd, Packet8d) +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PacketXcf, PacketXf) +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PacketXcd, PacketXd) } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/clang/MathFunctions.h b/Eigen/src/Core/arch/clang/MathFunctions.h index 706a87051..c2afeda8f 100644 --- a/Eigen/src/Core/arch/clang/MathFunctions.h +++ b/Eigen/src/Core/arch/clang/MathFunctions.h @@ -18,27 +18,27 @@ namespace Eigen { namespace internal { template <> -EIGEN_STRONG_INLINE Packet16f pfrexp(const Packet16f& a, Packet16f& exponent) { +EIGEN_STRONG_INLINE PacketXf pfrexp(const PacketXf& a, PacketXf& exponent) { return pfrexp_generic(a, exponent); } template <> -EIGEN_STRONG_INLINE Packet8d pfrexp(const Packet8d& a, Packet8d& exponent) { +EIGEN_STRONG_INLINE PacketXd pfrexp(const PacketXd& a, PacketXd& exponent) { return pfrexp_generic(a, exponent); } template <> -EIGEN_STRONG_INLINE Packet16f pldexp(const Packet16f& a, const Packet16f& exponent) { +EIGEN_STRONG_INLINE PacketXf pldexp(const PacketXf& a, const PacketXf& exponent) { return pldexp_generic(a, exponent); } template <> -EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, const Packet8d& exponent) { +EIGEN_STRONG_INLINE PacketXd pldexp(const PacketXd& a, const PacketXd& exponent) { return pldexp_generic(a, exponent); } -EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet16f) -EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet8d) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketXf) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketXd) } // end namespace internal diff --git a/Eigen/src/Core/arch/clang/PacketMath.h b/Eigen/src/Core/arch/clang/PacketMath.h index 4beadfe64..84126944f 100644 --- a/Eigen/src/Core/arch/clang/PacketMath.h +++ b/Eigen/src/Core/arch/clang/PacketMath.h @@ -24,14 +24,32 @@ template using VectorType = ScalarT __attribute__((ext_vector_type(n), aligned(n * sizeof(ScalarT)))); } // namespace detail -// --- Primary packet type definitions (fixed at 64 bytes) --- +// --- Naming Convention --- +// This backend uses size-independent type aliases so the same code works +// for EIGEN_GENERIC_VECTOR_SIZE_BYTES in {16, 32, 64}: +// +// PacketXf - float vector (4, 8, or 16 elements) +// PacketXd - double vector (2, 4, or 8 elements) +// PacketXi - int32_t vector (4, 8, or 16 elements) +// PacketXl - int64_t vector (2, 4, or 8 elements) +// PacketXcf - complex vector (2, 4, or 8 elements) [in Complex.h] +// PacketXcd - complex vector (1, 2, or 4 elements) [in Complex.h] +// +// The "X" suffix indicates the element count is determined by the macro +// EIGEN_GENERIC_VECTOR_SIZE_BYTES at compile time. Operations that require +// compile-time constant indices (e.g. __builtin_shufflevector) use +// #if EIGEN_GENERIC_VECTOR_SIZE_BYTES == ... blocks. -// TODO(rmlarsen): Generalize to other vector sizes. 
-static_assert(EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64, "We currently assume the full vector size is 64 bytes"); -using Packet16f = detail::VectorType; -using Packet8d = detail::VectorType; -using Packet16i = detail::VectorType; -using Packet8l = detail::VectorType; +static_assert(EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 || EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 || + EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64, + "EIGEN_GENERIC_VECTOR_SIZE_BYTES must be 16, 32, or 64"); + +constexpr int kFloatPacketSize = EIGEN_GENERIC_VECTOR_SIZE_BYTES / sizeof(float); +constexpr int kDoublePacketSize = EIGEN_GENERIC_VECTOR_SIZE_BYTES / sizeof(double); +using PacketXf = detail::VectorType; +using PacketXd = detail::VectorType; +using PacketXi = detail::VectorType; +using PacketXl = detail::VectorType; // --- packet_traits specializations --- struct generic_float_packet_traits : default_packet_traits { @@ -82,20 +100,20 @@ struct generic_float_packet_traits : default_packet_traits { template <> struct packet_traits : generic_float_packet_traits { - using type = Packet16f; - using half = Packet16f; + using type = PacketXf; + using half = PacketXf; enum { - size = 16, + size = kFloatPacketSize, }; }; template <> struct packet_traits : generic_float_packet_traits { - using type = Packet8d; - using half = Packet8d; + using type = PacketXd; + using half = PacketXd; // Generic double-precision acos/asin are not yet implemented in // GenericPacketMathFunctions.h (only float versions exist). - enum { size = 8, HasACos = 0, HasASin = 0 }; + enum { size = kDoublePacketSize, HasACos = 0, HasASin = 0 }; }; struct generic_integer_packet_traits : default_packet_traits { @@ -131,19 +149,19 @@ struct generic_integer_packet_traits : default_packet_traits { template <> struct packet_traits : generic_integer_packet_traits { - using type = Packet16i; - using half = Packet16i; + using type = PacketXi; + using half = PacketXi; enum { - size = 16, + size = kFloatPacketSize, }; }; template <> struct packet_traits : generic_integer_packet_traits { - using type = Packet8l; - using half = Packet8l; + using type = PacketXl; + using half = PacketXl; enum { - size = 8, + size = kDoublePacketSize, }; }; @@ -156,37 +174,37 @@ struct generic_unpacket_traits : default_unpacket_traits { }; template <> -struct unpacket_traits : generic_unpacket_traits { +struct unpacket_traits : generic_unpacket_traits { using type = float; - using half = Packet16f; - using integer_packet = Packet16i; + using half = PacketXf; + using integer_packet = PacketXi; enum { - size = 16, + size = kFloatPacketSize, }; }; template <> -struct unpacket_traits : generic_unpacket_traits { +struct unpacket_traits : generic_unpacket_traits { using type = double; - using half = Packet8d; - using integer_packet = Packet8l; + using half = PacketXd; + using integer_packet = PacketXl; enum { - size = 8, + size = kDoublePacketSize, }; }; template <> -struct unpacket_traits : generic_unpacket_traits { +struct unpacket_traits : generic_unpacket_traits { using type = int32_t; - using half = Packet16i; + using half = PacketXi; enum { - size = 16, + size = kFloatPacketSize, }; }; template <> -struct unpacket_traits : generic_unpacket_traits { +struct unpacket_traits : generic_unpacket_traits { using type = int64_t; - using half = Packet8l; + using half = PacketXl; enum { - size = 8, + size = kDoublePacketSize, }; }; @@ -265,21 +283,21 @@ EIGEN_STRONG_INLINE void store_vector_aligned(scalar_type_of_vector_t* detail::store_vector_aligned(to, from); \ } 
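Note on the sizing scheme introduced above: the following is a minimal standalone sketch (not part of the patch) of how the size-generic aliases are expected to resolve from EIGEN_GENERIC_VECTOR_SIZE_BYTES. The default macro value of 32 and the static_asserts are illustrative assumptions only; the alias names mirror the patch.

    // Illustrative sketch only: size-generic packet aliases via clang's
    // ext_vector_type. The macro value below is an arbitrary example.
    #ifndef EIGEN_GENERIC_VECTOR_SIZE_BYTES
    #define EIGEN_GENERIC_VECTOR_SIZE_BYTES 32
    #endif

    template <typename ScalarT, int n>
    using VectorType = ScalarT __attribute__((ext_vector_type(n), aligned(n * sizeof(ScalarT))));

    constexpr int kFloatPacketSize = EIGEN_GENERIC_VECTOR_SIZE_BYTES / sizeof(float);   // 4, 8, or 16
    constexpr int kDoublePacketSize = EIGEN_GENERIC_VECTOR_SIZE_BYTES / sizeof(double); // 2, 4, or 8

    using PacketXf = VectorType<float, kFloatPacketSize>;
    using PacketXd = VectorType<double, kDoublePacketSize>;

    // Whatever the configured byte size, each packet fills one full vector.
    static_assert(sizeof(PacketXf) == EIGEN_GENERIC_VECTOR_SIZE_BYTES, "float packet fills the vector");
    static_assert(sizeof(PacketXd) == EIGEN_GENERIC_VECTOR_SIZE_BYTES, "double packet fills the vector");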
-EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16f) -EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8d) -EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet16i) -EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(Packet8l) +EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXf) +EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXd) +EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXi) +EIGEN_CLANG_PACKET_LOAD_STORE_PACKET(PacketXl) #undef EIGEN_CLANG_PACKET_LOAD_STORE_PACKET // --- Broadcast operation --- template <> -EIGEN_STRONG_INLINE Packet16f pset1frombits(uint32_t from) { - return Packet16f(numext::bit_cast(from)); +EIGEN_STRONG_INLINE PacketXf pset1frombits(uint32_t from) { + return PacketXf(numext::bit_cast(from)); } template <> -EIGEN_STRONG_INLINE Packet8d pset1frombits(uint64_t from) { - return Packet8d(numext::bit_cast(from)); +EIGEN_STRONG_INLINE PacketXd pset1frombits(uint64_t from) { + return PacketXd(numext::bit_cast(from)); } #define EIGEN_CLANG_PACKET_SET1(PACKET_TYPE) \ @@ -292,10 +310,10 @@ EIGEN_STRONG_INLINE Packet8d pset1frombits(uint64_t from) { return from[0]; \ } -EIGEN_CLANG_PACKET_SET1(Packet16f) -EIGEN_CLANG_PACKET_SET1(Packet8d) -EIGEN_CLANG_PACKET_SET1(Packet16i) -EIGEN_CLANG_PACKET_SET1(Packet8l) +EIGEN_CLANG_PACKET_SET1(PacketXf) +EIGEN_CLANG_PACKET_SET1(PacketXd) +EIGEN_CLANG_PACKET_SET1(PacketXi) +EIGEN_CLANG_PACKET_SET1(PacketXl) #undef EIGEN_CLANG_PACKET_SET1 // --- Arithmetic operations --- @@ -309,10 +327,10 @@ EIGEN_CLANG_PACKET_SET1(Packet8l) return -a; \ } -EIGEN_CLANG_PACKET_ARITHMETIC(Packet16f) -EIGEN_CLANG_PACKET_ARITHMETIC(Packet8d) -EIGEN_CLANG_PACKET_ARITHMETIC(Packet16i) -EIGEN_CLANG_PACKET_ARITHMETIC(Packet8l) +EIGEN_CLANG_PACKET_ARITHMETIC(PacketXf) +EIGEN_CLANG_PACKET_ARITHMETIC(PacketXd) +EIGEN_CLANG_PACKET_ARITHMETIC(PacketXi) +EIGEN_CLANG_PACKET_ARITHMETIC(PacketXl) #undef EIGEN_CLANG_PACKET_ARITHMETIC // --- Bitwise operations (via casting) --- @@ -321,10 +339,10 @@ namespace detail { // Reinterpret-cast helpers, equivalent to preinterpret<> but defined here // because PacketMath.h is included before TypeCasting.h. 
-EIGEN_STRONG_INLINE Packet16i preinterpret_float_to_int(const Packet16f& a) { return reinterpret_cast(a); } -EIGEN_STRONG_INLINE Packet16f preinterpret_int_to_float(const Packet16i& a) { return reinterpret_cast(a); } -EIGEN_STRONG_INLINE Packet8l preinterpret_double_to_long(const Packet8d& a) { return reinterpret_cast(a); } -EIGEN_STRONG_INLINE Packet8d preinterpret_long_to_double(const Packet8l& a) { return reinterpret_cast(a); } +EIGEN_STRONG_INLINE PacketXi preinterpret_float_to_int(const PacketXf& a) { return reinterpret_cast(a); } +EIGEN_STRONG_INLINE PacketXf preinterpret_int_to_float(const PacketXi& a) { return reinterpret_cast(a); } +EIGEN_STRONG_INLINE PacketXl preinterpret_double_to_long(const PacketXd& a) { return reinterpret_cast(a); } +EIGEN_STRONG_INLINE PacketXd preinterpret_long_to_double(const PacketXl& a) { return reinterpret_cast(a); } } // namespace detail @@ -368,8 +386,8 @@ EIGEN_STRONG_INLINE Packet8d preinterpret_long_to_double(const Packet8l& a) { re return a << N; \ } -EIGEN_CLANG_PACKET_BITWISE_INT(Packet16i) -EIGEN_CLANG_PACKET_BITWISE_INT(Packet8l) +EIGEN_CLANG_PACKET_BITWISE_INT(PacketXi) +EIGEN_CLANG_PACKET_BITWISE_INT(PacketXl) #undef EIGEN_CLANG_PACKET_BITWISE_INT // Bitwise ops for floating point packets @@ -401,8 +419,8 @@ EIGEN_CLANG_PACKET_BITWISE_INT(Packet8l) return CAST_FROM_INT(CAST_TO_INT(a) & ~CAST_TO_INT(b)); \ } -EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet16f, detail::preinterpret_float_to_int, detail::preinterpret_int_to_float) -EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet8d, detail::preinterpret_double_to_long, detail::preinterpret_long_to_double) +EIGEN_CLANG_PACKET_BITWISE_FLOAT(PacketXf, detail::preinterpret_float_to_int, detail::preinterpret_int_to_float) +EIGEN_CLANG_PACKET_BITWISE_FLOAT(PacketXd, detail::preinterpret_double_to_long, detail::preinterpret_long_to_double) #undef EIGEN_CLANG_PACKET_BITWISE_FLOAT // --- Comparison operations --- @@ -428,8 +446,8 @@ EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet8d, detail::preinterpret_double_to_long, return numext::bit_cast(INT_PACKET_TYPE(!(a >= b))); \ } -EIGEN_CLANG_PACKET_CMP(Packet16f, Packet16i) -EIGEN_CLANG_PACKET_CMP(Packet8d, Packet8l) +EIGEN_CLANG_PACKET_CMP(PacketXf, PacketXi) +EIGEN_CLANG_PACKET_CMP(PacketXd, PacketXl) #undef EIGEN_CLANG_PACKET_CMP // --- Min/Max operations --- @@ -472,10 +490,10 @@ EIGEN_CLANG_PACKET_CMP(Packet8d, Packet8l) return mask != 0 ? 
a : b; \ } -EIGEN_CLANG_PACKET_ELEMENTWISE(Packet16f) -EIGEN_CLANG_PACKET_ELEMENTWISE(Packet8d) -EIGEN_CLANG_PACKET_ELEMENTWISE(Packet16i) -EIGEN_CLANG_PACKET_ELEMENTWISE(Packet8l) +EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXf) +EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXd) +EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXi) +EIGEN_CLANG_PACKET_ELEMENTWISE(PacketXl) #undef EIGEN_CLANG_PACKET_ELEMENTWISE #endif @@ -510,8 +528,8 @@ EIGEN_CLANG_PACKET_ELEMENTWISE(Packet8l) return __builtin_elementwise_sqrt(a); \ } -EIGEN_CLANG_PACKET_MATH_FLOAT(Packet16f) -EIGEN_CLANG_PACKET_MATH_FLOAT(Packet8d) +EIGEN_CLANG_PACKET_MATH_FLOAT(PacketXf) +EIGEN_CLANG_PACKET_MATH_FLOAT(PacketXd) #undef EIGEN_CLANG_PACKET_MATH_FLOAT #endif @@ -563,8 +581,8 @@ EIGEN_CLANG_PACKET_MATH_FLOAT(Packet8d) } #endif -EIGEN_CLANG_PACKET_MADD(Packet16f) -EIGEN_CLANG_PACKET_MADD(Packet8d) +EIGEN_CLANG_PACKET_MADD(PacketXf) +EIGEN_CLANG_PACKET_MADD(PacketXd) #undef EIGEN_CLANG_PACKET_MADD #define EIGEN_CLANG_PACKET_SCATTER_GATHER(PACKET_TYPE) \ @@ -586,10 +604,10 @@ EIGEN_CLANG_PACKET_MADD(Packet8d) return result; \ } -EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet16f) -EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8d) -EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet16i) -EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8l) +EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXf) +EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXd) +EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXi) +EIGEN_CLANG_PACKET_SCATTER_GATHER(PacketXl) #undef EIGEN_CLANG_PACKET_SCATTER_GATHER @@ -597,6 +615,14 @@ EIGEN_CLANG_PACKET_SCATTER_GATHER(Packet8l) #if EIGEN_HAS_BUILTIN(__builtin_shufflevector) namespace detail { template +EIGEN_STRONG_INLINE Packet preverse_impl_2(const Packet& a) { + return __builtin_shufflevector(a, a, 1, 0); +} +template +EIGEN_STRONG_INLINE Packet preverse_impl_4(const Packet& a) { + return __builtin_shufflevector(a, a, 3, 2, 1, 0); +} +template EIGEN_STRONG_INLINE Packet preverse_impl_8(const Packet& a) { return __builtin_shufflevector(a, a, 7, 6, 5, 4, 3, 2, 1, 0); } @@ -606,33 +632,81 @@ EIGEN_STRONG_INLINE Packet preverse_impl_16(const Packet& a) { } } // namespace detail -#define EIGEN_CLANG_PACKET_REVERSE(PACKET_TYPE, SIZE) \ - template <> \ - EIGEN_STRONG_INLINE PACKET_TYPE preverse(const PACKET_TYPE& a) { \ - return detail::preverse_impl_##SIZE(a); \ - } +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 -EIGEN_CLANG_PACKET_REVERSE(Packet16f, 16) -EIGEN_CLANG_PACKET_REVERSE(Packet8d, 8) -EIGEN_CLANG_PACKET_REVERSE(Packet16i, 16) -EIGEN_CLANG_PACKET_REVERSE(Packet8l, 8) -#undef EIGEN_CLANG_PACKET_REVERSE +template <> +EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) { + return detail::preverse_impl_4(a); +} +template <> +EIGEN_STRONG_INLINE PacketXd preverse(const PacketXd& a) { + return detail::preverse_impl_2(a); +} +template <> +EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) { + return detail::preverse_impl_4(a); +} +template <> +EIGEN_STRONG_INLINE PacketXl preverse(const PacketXl& a) { + return detail::preverse_impl_2(a); +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) { + return detail::preverse_impl_8(a); +} +template <> +EIGEN_STRONG_INLINE PacketXd preverse(const PacketXd& a) { + return detail::preverse_impl_4(a); +} +template <> +EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) { + return detail::preverse_impl_8(a); +} +template <> +EIGEN_STRONG_INLINE PacketXl preverse(const PacketXl& a) { + return detail::preverse_impl_4(a); +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + 
+template <> +EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) { + return detail::preverse_impl_16(a); +} +template <> +EIGEN_STRONG_INLINE PacketXd preverse(const PacketXd& a) { + return detail::preverse_impl_8(a); +} +template <> +EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) { + return detail::preverse_impl_16(a); +} +template <> +EIGEN_STRONG_INLINE PacketXl preverse(const PacketXl& a) { + return detail::preverse_impl_8(a); +} + +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES namespace detail { + template -EIGEN_STRONG_INLINE Packet ploaddup16(const typename unpacket_traits::type* from) { +EIGEN_STRONG_INLINE Packet ploaddup2(const typename unpacket_traits::type* from) { static_assert((unpacket_traits::size) % 2 == 0, "Packet size must be a multiple of 2"); using HalfPacket = HalfPacket; HalfPacket a = load_vector_unaligned(from); - return __builtin_shufflevector(a, a, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7); + return __builtin_shufflevector(a, a, 0, 0); } template -EIGEN_STRONG_INLINE Packet ploadquad16(const typename unpacket_traits::type* from) { - static_assert((unpacket_traits::size) % 4 == 0, "Packet size must be a multiple of 4"); - using QuarterPacket = QuarterPacket; - QuarterPacket a = load_vector_unaligned(from); - return __builtin_shufflevector(a, a, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3); +EIGEN_STRONG_INLINE Packet ploaddup4(const typename unpacket_traits::type* from) { + static_assert((unpacket_traits::size) % 2 == 0, "Packet size must be a multiple of 2"); + using HalfPacket = HalfPacket; + HalfPacket a = load_vector_unaligned(from); + return __builtin_shufflevector(a, a, 0, 0, 1, 1); } template @@ -643,6 +717,22 @@ EIGEN_STRONG_INLINE Packet ploaddup8(const typename unpacket_traits::typ return __builtin_shufflevector(a, a, 0, 0, 1, 1, 2, 2, 3, 3); } +template +EIGEN_STRONG_INLINE Packet ploaddup16(const typename unpacket_traits::type* from) { + static_assert((unpacket_traits::size) % 2 == 0, "Packet size must be a multiple of 2"); + using HalfPacket = HalfPacket; + HalfPacket a = load_vector_unaligned(from); + return __builtin_shufflevector(a, a, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7); +} + +template +EIGEN_STRONG_INLINE Packet ploadquad4(const typename unpacket_traits::type* from) { + static_assert((unpacket_traits::size) % 4 == 0, "Packet size must be a multiple of 4"); + using QuarterPacket = QuarterPacket; + QuarterPacket a = load_vector_unaligned(from); + return __builtin_shufflevector(a, a, 0, 0, 0, 0); +} + template EIGEN_STRONG_INLINE Packet ploadquad8(const typename unpacket_traits::type* from) { static_assert((unpacket_traits::size) % 4 == 0, "Packet size must be a multiple of 4"); @@ -651,84 +741,241 @@ EIGEN_STRONG_INLINE Packet ploadquad8(const typename unpacket_traits::ty return __builtin_shufflevector(a, a, 0, 0, 0, 0, 1, 1, 1, 1); } +template +EIGEN_STRONG_INLINE Packet ploadquad16(const typename unpacket_traits::type* from) { + static_assert((unpacket_traits::size) % 4 == 0, "Packet size must be a multiple of 4"); + using QuarterPacket = QuarterPacket; + QuarterPacket a = load_vector_unaligned(from); + return __builtin_shufflevector(a, a, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3); +} + } // namespace detail -template <> -EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) { - return detail::ploaddup16(from); -} -template <> -EIGEN_STRONG_INLINE Packet8d ploaddup(const double* from) { - return detail::ploaddup8(from); -} -template <> -EIGEN_STRONG_INLINE Packet16i ploaddup(const int32_t* from) { - 
return detail::ploaddup16(from); -} -template <> -EIGEN_STRONG_INLINE Packet8l ploaddup(const int64_t* from) { - return detail::ploaddup8(from); -} +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 template <> -EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) { - return detail::ploadquad16(from); +EIGEN_STRONG_INLINE PacketXf ploaddup(const float* from) { + return detail::ploaddup4(from); } template <> -EIGEN_STRONG_INLINE Packet8d ploadquad(const double* from) { - return detail::ploadquad8(from); +EIGEN_STRONG_INLINE PacketXd ploaddup(const double* from) { + return detail::ploaddup2(from); } template <> -EIGEN_STRONG_INLINE Packet16i ploadquad(const int32_t* from) { - return detail::ploadquad16(from); +EIGEN_STRONG_INLINE PacketXi ploaddup(const int32_t* from) { + return detail::ploaddup4(from); } template <> -EIGEN_STRONG_INLINE Packet8l ploadquad(const int64_t* from) { - return detail::ploadquad8(from); +EIGEN_STRONG_INLINE PacketXl ploaddup(const int64_t* from) { + return detail::ploaddup2(from); } +template <> +EIGEN_STRONG_INLINE PacketXf ploadquad(const float* from) { + return detail::ploadquad4(from); +} +template <> +EIGEN_STRONG_INLINE PacketXi ploadquad(const int32_t* from) { + return detail::ploadquad4(from); +} +// No ploadquad for 2-element packets (PacketXd, PacketXl) at 16 bytes. + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 template <> -EIGEN_STRONG_INLINE Packet16f plset(const float& a) { - Packet16f x{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f, a + 4.0f, a + 5.0f, a + 6.0f, a + 7.0f, - a + 8.0f, a + 9.0f, a + 10.0f, a + 11.0f, a + 12.0f, a + 13.0f, a + 14.0f, a + 15.0f}; - return x; +EIGEN_STRONG_INLINE PacketXf ploaddup(const float* from) { + return detail::ploaddup8(from); } template <> -EIGEN_STRONG_INLINE Packet8d plset(const double& a) { - return Packet8d{a + 0.0, a + 1.0, a + 2.0, a + 3.0, a + 4.0, a + 5.0, a + 6.0, a + 7.0}; +EIGEN_STRONG_INLINE PacketXd ploaddup(const double* from) { + return detail::ploaddup4(from); } template <> -EIGEN_STRONG_INLINE Packet16i plset(const int32_t& a) { - return Packet16i{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7, - a + 8, a + 9, a + 10, a + 11, a + 12, a + 13, a + 14, a + 15}; +EIGEN_STRONG_INLINE PacketXi ploaddup(const int32_t* from) { + return detail::ploaddup8(from); } template <> -EIGEN_STRONG_INLINE Packet8l plset(const int64_t& a) { - return Packet8l{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7}; +EIGEN_STRONG_INLINE PacketXl ploaddup(const int64_t* from) { + return detail::ploaddup4(from); +} +template <> +EIGEN_STRONG_INLINE PacketXf ploadquad(const float* from) { + return detail::ploadquad8(from); +} +template <> +EIGEN_STRONG_INLINE PacketXd ploadquad(const double* from) { + return detail::ploadquad4(from); +} +template <> +EIGEN_STRONG_INLINE PacketXi ploadquad(const int32_t* from) { + return detail::ploadquad8(from); +} +template <> +EIGEN_STRONG_INLINE PacketXl ploadquad(const int64_t* from) { + return detail::ploadquad4(from); } +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + template <> -EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /* unused */) { +EIGEN_STRONG_INLINE PacketXf ploaddup(const float* from) { + return detail::ploaddup16(from); +} +template <> +EIGEN_STRONG_INLINE PacketXd ploaddup(const double* from) { + return detail::ploaddup8(from); +} +template <> +EIGEN_STRONG_INLINE PacketXi ploaddup(const int32_t* from) { + return detail::ploaddup16(from); +} +template <> +EIGEN_STRONG_INLINE PacketXl ploaddup(const int64_t* from) { + return detail::ploaddup8(from); +} 
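For context on the duplication loads specialized above, here is a minimal standalone sketch (not part of the patch) of the element-repetition pattern that ploaddup and ploadquad implement with __builtin_shufflevector. The 4-lane float vector and the helper names are hypothetical.

    // Illustrative sketch only: duplication patterns behind ploaddup/ploadquad.
    using Vec2f = float __attribute__((ext_vector_type(2)));
    using Vec4f = float __attribute__((ext_vector_type(4)));

    // ploaddup-style: read half a packet and repeat each element twice,
    // e.g. {a, b} -> {a, a, b, b}.
    inline Vec4f dup_each_twice(const float* from) {
      Vec2f half = {from[0], from[1]};
      return __builtin_shufflevector(half, half, 0, 0, 1, 1);
    }

    // ploadquad-style: read a quarter packet and repeat each element four times,
    // e.g. {a} -> {a, a, a, a}.
    inline Vec4f dup_each_four_times(const float* from) {
      Vec2f quarter = {from[0], 0.0f};  // only lane 0 is used
      return __builtin_shufflevector(quarter, quarter, 0, 0, 0, 0);
    }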
+template <> +EIGEN_STRONG_INLINE PacketXf ploadquad(const float* from) { + return detail::ploadquad16(from); +} +template <> +EIGEN_STRONG_INLINE PacketXd ploadquad(const double* from) { + return detail::ploadquad8(from); +} +template <> +EIGEN_STRONG_INLINE PacketXi ploadquad(const int32_t* from) { + return detail::ploadquad16(from); +} +template <> +EIGEN_STRONG_INLINE PacketXl ploadquad(const int64_t* from) { + return detail::ploadquad8(from); +} + +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + +// --- plset --- +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 + +template <> +EIGEN_STRONG_INLINE PacketXf plset(const float& a) { + return PacketXf{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f}; +} +template <> +EIGEN_STRONG_INLINE PacketXd plset(const double& a) { + return PacketXd{a + 0.0, a + 1.0}; +} +template <> +EIGEN_STRONG_INLINE PacketXi plset(const int32_t& a) { + return PacketXi{a + 0, a + 1, a + 2, a + 3}; +} +template <> +EIGEN_STRONG_INLINE PacketXl plset(const int64_t& a) { + return PacketXl{a + 0, a + 1}; +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE PacketXf plset(const float& a) { + return PacketXf{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f, a + 4.0f, a + 5.0f, a + 6.0f, a + 7.0f}; +} +template <> +EIGEN_STRONG_INLINE PacketXd plset(const double& a) { + return PacketXd{a + 0.0, a + 1.0, a + 2.0, a + 3.0}; +} +template <> +EIGEN_STRONG_INLINE PacketXi plset(const int32_t& a) { + return PacketXi{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7}; +} +template <> +EIGEN_STRONG_INLINE PacketXl plset(const int64_t& a) { + return PacketXl{a + 0, a + 1, a + 2, a + 3}; +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE PacketXf plset(const float& a) { + return PacketXf{a + 0.0f, a + 1.0f, a + 2.0f, a + 3.0f, a + 4.0f, a + 5.0f, a + 6.0f, a + 7.0f, + a + 8.0f, a + 9.0f, a + 10.0f, a + 11.0f, a + 12.0f, a + 13.0f, a + 14.0f, a + 15.0f}; +} +template <> +EIGEN_STRONG_INLINE PacketXd plset(const double& a) { + return PacketXd{a + 0.0, a + 1.0, a + 2.0, a + 3.0, a + 4.0, a + 5.0, a + 6.0, a + 7.0}; +} +template <> +EIGEN_STRONG_INLINE PacketXi plset(const int32_t& a) { + return PacketXi{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7, + a + 8, a + 9, a + 10, a + 11, a + 12, a + 13, a + 14, a + 15}; +} +template <> +EIGEN_STRONG_INLINE PacketXl plset(const int64_t& a) { + return PacketXl{a + 0, a + 1, a + 2, a + 3, a + 4, a + 5, a + 6, a + 7}; +} + +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + +// --- peven_mask --- +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 + +template <> +EIGEN_STRONG_INLINE PacketXf peven_mask(const PacketXf& /* unused */) { float kTrue = numext::bit_cast(int32_t(-1)); float kFalse = 0.0f; - return Packet16f{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, - kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse}; + return PacketXf{kTrue, kFalse, kTrue, kFalse}; } - template <> -EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /* unused */) { +EIGEN_STRONG_INLINE PacketXd peven_mask(const PacketXd& /* unused */) { double kTrue = numext::bit_cast(int64_t(-1l)); double kFalse = 0.0; - return Packet8d{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse}; + return PacketXd{kTrue, kFalse}; } +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE PacketXf peven_mask(const PacketXf& /* unused */) { + float kTrue = numext::bit_cast(int32_t(-1)); + float kFalse = 0.0f; + return PacketXf{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse}; +} 
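As a side note on peven_mask above: the kTrue lanes hold an all-ones bit pattern (bit_cast of -1) so the packet can act as a lane mask for bitwise selection. A minimal sketch of that idea, outside the patch, with hypothetical names and a 4-lane vector:

    // Illustrative sketch only: an even-lane mask whose "true" lanes are all-ones
    // bit patterns, suitable for bitwise AND-based lane selection.
    #include <cstdint>
    #include <cstring>

    using Vec4f = float __attribute__((ext_vector_type(4)));

    inline Vec4f even_mask4() {
      float t;
      const int32_t all_ones = -1;
      std::memcpy(&t, &all_ones, sizeof(t));  // reinterpret the all-ones bits as float
      Vec4f m = {t, 0.0f, t, 0.0f};           // even lanes "true", odd lanes "false"
      return m;
    }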
+template <> +EIGEN_STRONG_INLINE PacketXd peven_mask(const PacketXd& /* unused */) { + double kTrue = numext::bit_cast(int64_t(-1l)); + double kFalse = 0.0; + return PacketXd{kTrue, kFalse, kTrue, kFalse}; +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE PacketXf peven_mask(const PacketXf& /* unused */) { + float kTrue = numext::bit_cast(int32_t(-1)); + float kFalse = 0.0f; + return PacketXf{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, + kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse}; +} +template <> +EIGEN_STRONG_INLINE PacketXd peven_mask(const PacketXd& /* unused */) { + double kTrue = numext::bit_cast(int64_t(-1l)); + double kFalse = 0.0; + return PacketXd{kTrue, kFalse, kTrue, kFalse, kTrue, kFalse, kTrue, kFalse}; +} + +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + // Helpers for ptranspose. namespace detail { template -EIGEN_ALWAYS_INLINE void zip_in_place16(Packet& p1, Packet& p2) { - Packet tmp = __builtin_shufflevector(p1, p2, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); - p2 = __builtin_shufflevector(p1, p2, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); +EIGEN_ALWAYS_INLINE void zip_in_place2(Packet& p1, Packet& p2) { + Packet tmp = __builtin_shufflevector(p1, p2, 0, 2); + p2 = __builtin_shufflevector(p1, p2, 1, 3); + p1 = tmp; +} + +template +EIGEN_ALWAYS_INLINE void zip_in_place4(Packet& p1, Packet& p2) { + Packet tmp = __builtin_shufflevector(p1, p2, 0, 4, 1, 5); + p2 = __builtin_shufflevector(p1, p2, 2, 6, 3, 7); p1 = tmp; } @@ -739,28 +986,68 @@ EIGEN_ALWAYS_INLINE void zip_in_place8(Packet& p1, Packet& p2) { p1 = tmp; } +template +EIGEN_ALWAYS_INLINE void zip_in_place16(Packet& p1, Packet& p2) { + Packet tmp = __builtin_shufflevector(p1, p2, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + p2 = __builtin_shufflevector(p1, p2, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + p1 = tmp; +} + template void zip_in_place(Packet& p1, Packet& p2); +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 template <> -EIGEN_ALWAYS_INLINE void zip_in_place(Packet16f& p1, Packet16f& p2) { - zip_in_place16(p1, p2); +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXf& p1, PacketXf& p2) { + zip_in_place4(p1, p2); } - template <> -EIGEN_ALWAYS_INLINE void zip_in_place(Packet8d& p1, Packet8d& p2) { +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXd& p1, PacketXd& p2) { + zip_in_place2(p1, p2); +} +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXi& p1, PacketXi& p2) { + zip_in_place4(p1, p2); +} +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXl& p1, PacketXl& p2) { + zip_in_place2(p1, p2); +} +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXf& p1, PacketXf& p2) { zip_in_place8(p1, p2); } - template <> -EIGEN_ALWAYS_INLINE void zip_in_place(Packet16i& p1, Packet16i& p2) { - zip_in_place16(p1, p2); +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXd& p1, PacketXd& p2) { + zip_in_place4(p1, p2); } - template <> -EIGEN_ALWAYS_INLINE void zip_in_place(Packet8l& p1, Packet8l& p2) { +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXi& p1, PacketXi& p2) { zip_in_place8(p1, p2); } +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXl& p1, PacketXl& p2) { + zip_in_place4(p1, p2); +} +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXf& p1, PacketXf& p2) { + zip_in_place16(p1, p2); +} +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXd& p1, PacketXd& p2) { + 
zip_in_place8(p1, p2); +} +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXi& p1, PacketXi& p2) { + zip_in_place16(p1, p2); +} +template <> +EIGEN_ALWAYS_INLINE void zip_in_place(PacketXl& p1, PacketXl& p2) { + zip_in_place8(p1, p2); +} +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES template EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { @@ -812,62 +1099,69 @@ EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { } // namespace detail -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +// ptranspose overloads: only emit valid block sizes per vector size. +// At 16 bytes: float has 4 elems, double has 2 elems. +// At 32 bytes: float has 8 elems, double has 4 elems. +// At 64 bytes: float has 16 elems, double has 8 elems. + +// All sizes support PacketBlock and PacketBlock. +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +// All sizes support PacketBlock. +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +// All sizes support PacketBlock and PacketBlock. +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +// All sizes support PacketBlock. +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 32 +// 32+ bytes: float has 8+ elems, double has 4+ elems. +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { detail::ptranspose_impl(kernel); } +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +#endif -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES >= 64 +// 64 bytes: float has 16 elems, double has 8 elems. 
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { detail::ptranspose_impl(kernel); } - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { detail::ptranspose_impl(kernel); } - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { detail::ptranspose_impl(kernel); } - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - detail::ptranspose_impl(kernel); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - detail::ptranspose_impl(kernel); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - detail::ptranspose_impl(kernel); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - detail::ptranspose_impl(kernel); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - detail::ptranspose_impl(kernel); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { detail::ptranspose_impl(kernel); } #endif +#endif } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/clang/Reductions.h b/Eigen/src/Core/arch/clang/Reductions.h index defedf98d..37fc1617f 100644 --- a/Eigen/src/Core/arch/clang/Reductions.h +++ b/Eigen/src/Core/arch/clang/Reductions.h @@ -33,10 +33,10 @@ namespace internal { return __builtin_reduce_or(a != 0) != 0; \ } -EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet16f) -EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet8d) -EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet16i) -EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet8l) +EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXf) +EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXd) +EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXi) +EIGEN_CLANG_PACKET_REDUX_MINMAX(PacketXl) #undef EIGEN_CLANG_PACKET_REDUX_MINMAX #endif @@ -52,13 +52,38 @@ EIGEN_CLANG_PACKET_REDUX_MINMAX(Packet8l) } // __builtin_reduce_{mul,add} are only defined for integer types. -EIGEN_CLANG_PACKET_REDUX_INT(Packet16i) -EIGEN_CLANG_PACKET_REDUX_INT(Packet8l) +EIGEN_CLANG_PACKET_REDUX_INT(PacketXi) +EIGEN_CLANG_PACKET_REDUX_INT(PacketXl) #undef EIGEN_CLANG_PACKET_REDUX_INT #endif #if EIGEN_HAS_BUILTIN(__builtin_shufflevector) namespace detail { + +// Reduction helpers for different vector sizes. +// Each returns a pair of (even-sum, odd-sum) or (even-product, odd-product). 
+ +template +EIGEN_STRONG_INLINE std::pair, scalar_type_of_vector_t> ReduceAdd2( + const VectorT& a) { + return {a[0], a[1]}; +} + +template +EIGEN_STRONG_INLINE std::pair, scalar_type_of_vector_t> ReduceAdd4( + const VectorT& a) { + const auto t1 = __builtin_shufflevector(a, a, 0, 1) + __builtin_shufflevector(a, a, 2, 3); + return {t1[0], t1[1]}; +} + +template +EIGEN_STRONG_INLINE std::pair, scalar_type_of_vector_t> ReduceAdd8( + const VectorT& a) { + const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) + __builtin_shufflevector(a, a, 4, 5, 6, 7); + const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) + __builtin_shufflevector(t1, t1, 2, 3); + return {t2[0], t2[1]}; +} + template EIGEN_STRONG_INLINE std::pair, scalar_type_of_vector_t> ReduceAdd16( const VectorT& a) { @@ -70,10 +95,23 @@ EIGEN_STRONG_INLINE std::pair, scalar_type_of_v } template -EIGEN_STRONG_INLINE std::pair, scalar_type_of_vector_t> ReduceAdd8( +EIGEN_STRONG_INLINE std::pair, scalar_type_of_vector_t> ReduceMul2( const VectorT& a) { - const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) + __builtin_shufflevector(a, a, 4, 5, 6, 7); - const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) + __builtin_shufflevector(t1, t1, 2, 3); + return {a[0], a[1]}; +} + +template +EIGEN_STRONG_INLINE std::pair, scalar_type_of_vector_t> ReduceMul4( + const VectorT& a) { + const auto t1 = __builtin_shufflevector(a, a, 0, 1) * __builtin_shufflevector(a, a, 2, 3); + return {t1[0], t1[1]}; +} + +template +EIGEN_STRONG_INLINE std::pair, scalar_type_of_vector_t> ReduceMul8( + const VectorT& a) { + const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) * __builtin_shufflevector(a, a, 4, 5, 6, 7); + const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) * __builtin_shufflevector(t1, t1, 2, 3); return {t2[0], t2[1]}; } @@ -86,57 +124,188 @@ EIGEN_STRONG_INLINE std::pair, scalar_type_of_v const auto t3 = __builtin_shufflevector(t2, t2, 0, 1) * __builtin_shufflevector(t2, t2, 2, 3); return {t3[0], t3[1]}; } - -template -EIGEN_STRONG_INLINE std::pair, scalar_type_of_vector_t> ReduceMul8( - const VectorT& a) { - const auto t1 = __builtin_shufflevector(a, a, 0, 1, 2, 3) * __builtin_shufflevector(a, a, 4, 5, 6, 7); - const auto t2 = __builtin_shufflevector(t1, t1, 0, 1) * __builtin_shufflevector(t1, t1, 2, 3); - return {t2[0], t2[1]}; -} } // namespace detail +// --- predux and predux_mul for float --- +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 + template <> -EIGEN_STRONG_INLINE float predux(const Packet16f& a) { +EIGEN_STRONG_INLINE float predux(const PacketXf& a) { + float even, odd; + std::tie(even, odd) = detail::ReduceAdd4(a); + return even + odd; +} +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketXf& a) { + float even, odd; + std::tie(even, odd) = detail::ReduceMul4(a); + return even * odd; +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE float predux(const PacketXf& a) { + float even, odd; + std::tie(even, odd) = detail::ReduceAdd8(a); + return even + odd; +} +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketXf& a) { + float even, odd; + std::tie(even, odd) = detail::ReduceMul8(a); + return even * odd; +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE float predux(const PacketXf& a) { float even, odd; std::tie(even, odd) = detail::ReduceAdd16(a); return even + odd; } template <> -EIGEN_STRONG_INLINE double predux(const Packet8d& a) { +EIGEN_STRONG_INLINE float predux_mul(const PacketXf& a) { + float even, odd; + std::tie(even, odd) 
= detail::ReduceMul16(a); + return even * odd; +} + +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + +// --- predux and predux_mul for double --- +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 + +template <> +EIGEN_STRONG_INLINE double predux(const PacketXd& a) { + double even, odd; + std::tie(even, odd) = detail::ReduceAdd2(a); + return even + odd; +} +template <> +EIGEN_STRONG_INLINE double predux_mul(const PacketXd& a) { + double even, odd; + std::tie(even, odd) = detail::ReduceMul2(a); + return even * odd; +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE double predux(const PacketXd& a) { + double even, odd; + std::tie(even, odd) = detail::ReduceAdd4(a); + return even + odd; +} +template <> +EIGEN_STRONG_INLINE double predux_mul(const PacketXd& a) { + double even, odd; + std::tie(even, odd) = detail::ReduceMul4(a); + return even * odd; +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE double predux(const PacketXd& a) { double even, odd; std::tie(even, odd) = detail::ReduceAdd8(a); return even + odd; } template <> -EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) { - float even, odd; - std::tie(even, odd) = detail::ReduceMul16(a); - return even * odd; -} -template <> -EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) { +EIGEN_STRONG_INLINE double predux_mul(const PacketXd& a) { double even, odd; std::tie(even, odd) = detail::ReduceMul8(a); return even * odd; } +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + +// --- predux for complex --- +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 + template <> -EIGEN_STRONG_INLINE std::complex predux(const Packet8cf& a) { +EIGEN_STRONG_INLINE std::complex predux(const PacketXcf& a) { + float re, im; + std::tie(re, im) = detail::ReduceAdd4(a.v); + return std::complex(re, im); +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE std::complex predux(const PacketXcf& a) { + float re, im; + std::tie(re, im) = detail::ReduceAdd8(a.v); + return std::complex(re, im); +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE std::complex predux(const PacketXcf& a) { float re, im; std::tie(re, im) = detail::ReduceAdd16(a.v); return std::complex(re, im); } +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + +// --- predux for complex --- +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 + template <> -EIGEN_STRONG_INLINE std::complex predux(const Packet4cd& a) { +EIGEN_STRONG_INLINE std::complex predux(const PacketXcd& a) { + // 1 complex double: just return it + return a[0]; +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE std::complex predux(const PacketXcd& a) { + double re, im; + std::tie(re, im) = detail::ReduceAdd4(a.v); + return std::complex(re, im); +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE std::complex predux(const PacketXcd& a) { double re, im; std::tie(re, im) = detail::ReduceAdd8(a.v); return std::complex(re, im); } +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + +// --- predux_mul for complex --- +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 + template <> -EIGEN_STRONG_INLINE std::complex predux_mul(const Packet8cf& a) { +EIGEN_STRONG_INLINE std::complex predux_mul(const PacketXcf& a) { + // 2 complex floats: just multiply them + return a[0] * a[1]; +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const PacketXcf& a) { + // 4 complex floats: split into 2+2, multiply, then scalar multiply 
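+  // The even/odd ReduceMul helpers above cannot be reused here because a complex
+  // product mixes the real and imaginary lanes ((a+bi)(c+di) = (ac-bd) + (ad+bc)i),
+  // so the packet is instead folded in half with the complex pmul until only two
+  // complex values remain.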
+ const Packet2cf lower2 = Packet2cf(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3)); + const Packet2cf upper2 = Packet2cf(__builtin_shufflevector(a.v, a.v, 4, 5, 6, 7)); + const Packet2cf prod2 = pmul(lower2, upper2); + return prod2[0] * prod2[1]; +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const PacketXcf& a) { + // 8 complex floats: 8->4->2->scalar const Packet4cf lower4 = Packet4cf(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3, 4, 5, 6, 7)); const Packet4cf upper4 = Packet4cf(__builtin_shufflevector(a.v, a.v, 8, 9, 10, 11, 12, 13, 14, 15)); const Packet4cf prod4 = pmul(lower4, upper4); @@ -146,14 +315,38 @@ EIGEN_STRONG_INLINE std::complex predux_mul(const Packet8cf& a return prod2[0] * prod2[1]; } +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + +// --- predux_mul for complex --- +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 + template <> -EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cd& a) { +EIGEN_STRONG_INLINE std::complex predux_mul(const PacketXcd& a) { + // 1 complex double: just return it + return a[0]; +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const PacketXcd& a) { + // 2 complex doubles: just multiply them + return a[0] * a[1]; +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + +template <> +EIGEN_STRONG_INLINE std::complex predux_mul(const PacketXcd& a) { + // 4 complex doubles: split into 2+2, multiply, then scalar multiply const Packet2cd lower2 = Packet2cd(__builtin_shufflevector(a.v, a.v, 0, 1, 2, 3)); const Packet2cd upper2 = Packet2cd(__builtin_shufflevector(a.v, a.v, 4, 5, 6, 7)); const Packet2cd prod2 = pmul(lower2, upper2); return prod2[0] * prod2[1]; } +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES + #endif } // end namespace internal diff --git a/Eigen/src/Core/arch/clang/TypeCasting.h b/Eigen/src/Core/arch/clang/TypeCasting.h index 87ac9ea48..75281b2a2 100644 --- a/Eigen/src/Core/arch/clang/TypeCasting.h +++ b/Eigen/src/Core/arch/clang/TypeCasting.h @@ -20,56 +20,140 @@ namespace internal { // preinterpret //============================================================================== template <> -EIGEN_STRONG_INLINE Packet16f preinterpret(const Packet16i& a) { - return reinterpret_cast(a); +EIGEN_STRONG_INLINE PacketXf preinterpret(const PacketXi& a) { + return reinterpret_cast(a); } template <> -EIGEN_STRONG_INLINE Packet16i preinterpret(const Packet16f& a) { - return reinterpret_cast(a); +EIGEN_STRONG_INLINE PacketXi preinterpret(const PacketXf& a) { + return reinterpret_cast(a); } template <> -EIGEN_STRONG_INLINE Packet8d preinterpret(const Packet8l& a) { - return reinterpret_cast(a); +EIGEN_STRONG_INLINE PacketXd preinterpret(const PacketXl& a) { + return reinterpret_cast(a); } template <> -EIGEN_STRONG_INLINE Packet8l preinterpret(const Packet8d& a) { - return reinterpret_cast(a); +EIGEN_STRONG_INLINE PacketXl preinterpret(const PacketXd& a) { + return reinterpret_cast(a); } //============================================================================== // pcast //============================================================================== #if EIGEN_HAS_BUILTIN(__builtin_convertvector) +// Float-to-int conversions: __builtin_convertvector has UB for NaN/inf/ +// out-of-range inputs. Replace NaN with 0 before converting so that +// pldexp_fast (which may pass NaN exponents) doesn't trigger UB. 
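+// The guard relies on the lane-wise vector select: (a == a) is false only for NaN
+// lanes, so `a == a ? a : PacketXf(0)` zeroes exactly those lanes and leaves every
+// other lane (including infinities and out-of-range values) unchanged.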
template <> -EIGEN_STRONG_INLINE Packet16i pcast(const Packet16f& a) { - return __builtin_convertvector(a, Packet16i); +EIGEN_STRONG_INLINE PacketXi pcast(const PacketXf& a) { + const PacketXf safe = a == a ? a : PacketXf(0); + return __builtin_convertvector(safe, PacketXi); } template <> -EIGEN_STRONG_INLINE Packet16f pcast(const Packet16i& a) { - return __builtin_convertvector(a, Packet16f); +EIGEN_STRONG_INLINE PacketXf pcast(const PacketXi& a) { + return __builtin_convertvector(a, PacketXf); } template <> -EIGEN_STRONG_INLINE Packet8l pcast(const Packet8d& a) { - return __builtin_convertvector(a, Packet8l); +EIGEN_STRONG_INLINE PacketXl pcast(const PacketXd& a) { + const PacketXd safe = a == a ? a : PacketXd(0); + return __builtin_convertvector(safe, PacketXl); } template <> -EIGEN_STRONG_INLINE Packet8d pcast(const Packet8l& a) { - return __builtin_convertvector(a, Packet8d); +EIGEN_STRONG_INLINE PacketXd pcast(const PacketXl& a) { + return __builtin_convertvector(a, PacketXd); } +// float -> double: converts lower half of floats to doubles +// double -> float: converts two PacketXd to one PacketXf +// int32 -> int64: converts lower half of int32s to int64s +// int64 -> int32: converts two PacketXl to one PacketXi + +#if EIGEN_GENERIC_VECTOR_SIZE_BYTES == 16 + +// float -> double: converts lower 2 floats to 2 doubles +template <> +EIGEN_STRONG_INLINE PacketXd pcast(const PacketXf& a) { + using HalfFloat = detail::VectorType; + HalfFloat lo = __builtin_shufflevector(a, a, 0, 1); + return __builtin_convertvector(lo, PacketXd); +} + +// double -> float: converts two PacketXd (2 doubles each) to one PacketXf (4 floats) +template <> +EIGEN_STRONG_INLINE PacketXf pcast(const PacketXd& a, const PacketXd& b) { + using HalfFloat = detail::VectorType; + HalfFloat lo = __builtin_convertvector(a, HalfFloat); + HalfFloat hi = __builtin_convertvector(b, HalfFloat); + return __builtin_shufflevector(lo, hi, 0, 1, 2, 3); +} + +// int32 -> int64: converts lower 2 int32s to 2 int64s +template <> +EIGEN_STRONG_INLINE PacketXl pcast(const PacketXi& a) { + using HalfInt = detail::VectorType; + HalfInt lo = __builtin_shufflevector(a, a, 0, 1); + return __builtin_convertvector(lo, PacketXl); +} + +// int64 -> int32: converts two PacketXl (2 int64s each) to one PacketXi (4 int32s) +template <> +EIGEN_STRONG_INLINE PacketXi pcast(const PacketXl& a, const PacketXl& b) { + using HalfInt = detail::VectorType; + HalfInt lo = __builtin_convertvector(a, HalfInt); + HalfInt hi = __builtin_convertvector(b, HalfInt); + return __builtin_shufflevector(lo, hi, 0, 1, 2, 3); +} + +#elif EIGEN_GENERIC_VECTOR_SIZE_BYTES == 32 + +// float -> double: converts lower 4 floats to 4 doubles +template <> +EIGEN_STRONG_INLINE PacketXd pcast(const PacketXf& a) { + using HalfFloat = detail::VectorType; + HalfFloat lo = __builtin_shufflevector(a, a, 0, 1, 2, 3); + return __builtin_convertvector(lo, PacketXd); +} + +// double -> float: converts two PacketXd (4 doubles each) to one PacketXf (8 floats) +template <> +EIGEN_STRONG_INLINE PacketXf pcast(const PacketXd& a, const PacketXd& b) { + using HalfFloat = detail::VectorType; + HalfFloat lo = __builtin_convertvector(a, HalfFloat); + HalfFloat hi = __builtin_convertvector(b, HalfFloat); + return __builtin_shufflevector(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7); +} + +// int32 -> int64: converts lower 4 int32s to 4 int64s +template <> +EIGEN_STRONG_INLINE PacketXl pcast(const PacketXi& a) { + using HalfInt = detail::VectorType; + HalfInt lo = __builtin_shufflevector(a, a, 0, 1, 2, 3); + return 
__builtin_convertvector(lo, PacketXl); +} + +// int64 -> int32: converts two PacketXl (4 int64s each) to one PacketXi (8 int32s) +template <> +EIGEN_STRONG_INLINE PacketXi pcast(const PacketXl& a, const PacketXl& b) { + using HalfInt = detail::VectorType; + HalfInt lo = __builtin_convertvector(a, HalfInt); + HalfInt hi = __builtin_convertvector(b, HalfInt); + return __builtin_shufflevector(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7); +} + +#else // EIGEN_GENERIC_VECTOR_SIZE_BYTES == 64 + // float -> double: converts lower 8 floats to 8 doubles template <> -EIGEN_STRONG_INLINE Packet8d pcast(const Packet16f& a) { +EIGEN_STRONG_INLINE PacketXd pcast(const PacketXf& a) { using HalfFloat = detail::VectorType; HalfFloat lo = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7); - return __builtin_convertvector(lo, Packet8d); + return __builtin_convertvector(lo, PacketXd); } -// double -> float: converts two Packet8d to one Packet16f +// double -> float: converts two PacketXd to one PacketXf template <> -EIGEN_STRONG_INLINE Packet16f pcast(const Packet8d& a, const Packet8d& b) { +EIGEN_STRONG_INLINE PacketXf pcast(const PacketXd& a, const PacketXd& b) { using HalfFloat = detail::VectorType; HalfFloat lo = __builtin_convertvector(a, HalfFloat); HalfFloat hi = __builtin_convertvector(b, HalfFloat); @@ -78,20 +162,22 @@ EIGEN_STRONG_INLINE Packet16f pcast(const Packet8d& a, cons // int32 -> int64: converts lower 8 int32s to 8 int64s template <> -EIGEN_STRONG_INLINE Packet8l pcast(const Packet16i& a) { +EIGEN_STRONG_INLINE PacketXl pcast(const PacketXi& a) { using HalfInt = detail::VectorType; HalfInt lo = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7); - return __builtin_convertvector(lo, Packet8l); + return __builtin_convertvector(lo, PacketXl); } -// int64 -> int32: converts two Packet8l to one Packet16i +// int64 -> int32: converts two PacketXl to one PacketXi template <> -EIGEN_STRONG_INLINE Packet16i pcast(const Packet8l& a, const Packet8l& b) { +EIGEN_STRONG_INLINE PacketXi pcast(const PacketXl& a, const PacketXl& b) { using HalfInt = detail::VectorType; HalfInt lo = __builtin_convertvector(a, HalfInt); HalfInt hi = __builtin_convertvector(b, HalfInt); return __builtin_shufflevector(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } + +#endif // EIGEN_GENERIC_VECTOR_SIZE_BYTES #endif } // end namespace internal diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d167710bf..916ac21ec 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -186,6 +186,17 @@ ei_add_test(mixingtypes) ei_add_test(float_conversion) ei_add_test(io) ei_add_test(packetmath "-DEIGEN_FAST_MATH=1") +# Generic clang vector backend tests for different vector sizes. +include(CheckCXXSourceCompiles) +check_cxx_source_compiles(" + typedef float v4sf __attribute__((ext_vector_type(4))); + int main() { return __builtin_vectorelements(v4sf{}); } +" COMPILER_SUPPORTS_VECTOR_EXTENSIONS) +if(COMPILER_SUPPORTS_VECTOR_EXTENSIONS) + ei_add_test(packetmath_generic_16 "-DEIGEN_FAST_MATH=1") + ei_add_test(packetmath_generic_32 "-DEIGEN_FAST_MATH=1") + ei_add_test(packetmath_generic_64 "-DEIGEN_FAST_MATH=1") +endif() ei_add_test(packet_segment) ei_add_test(vectorization_logic) ei_add_test(basicstuff) diff --git a/test/packetmath_generic_16.cpp b/test/packetmath_generic_16.cpp new file mode 100644 index 000000000..612a75c65 --- /dev/null +++ b/test/packetmath_generic_16.cpp @@ -0,0 +1,4 @@ +// Force the generic clang vector backend with 16-byte vectors. 
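+// Defining these macros before including packetmath.cpp compiles the whole
+// packetmath suite against the generic backend at this vector width.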
+#define EIGEN_VECTORIZE_GENERIC 1 +#define EIGEN_GENERIC_VECTOR_SIZE_BYTES 16 +#include "packetmath.cpp" diff --git a/test/packetmath_generic_32.cpp b/test/packetmath_generic_32.cpp new file mode 100644 index 000000000..9816f9fa5 --- /dev/null +++ b/test/packetmath_generic_32.cpp @@ -0,0 +1,4 @@ +// Force the generic clang vector backend with 32-byte vectors. +#define EIGEN_VECTORIZE_GENERIC 1 +#define EIGEN_GENERIC_VECTOR_SIZE_BYTES 32 +#include "packetmath.cpp" diff --git a/test/packetmath_generic_64.cpp b/test/packetmath_generic_64.cpp new file mode 100644 index 000000000..69575449e --- /dev/null +++ b/test/packetmath_generic_64.cpp @@ -0,0 +1,4 @@ +// Force the generic clang vector backend with 64-byte vectors. +#define EIGEN_VECTORIZE_GENERIC 1 +#define EIGEN_GENERIC_VECTOR_SIZE_BYTES 64 +#include "packetmath.cpp"