From 2c6b61c0062375668d6a2b46afb38b8d8d3a6f71 Mon Sep 17 00:00:00 2001
From: Cheng Wang
Date: Mon, 22 Jan 2024 21:23:21 +0000
Subject: [PATCH] Add half and quarter vector support to HVX architecture

---
 Eigen/Core                                        |   4 -
 .../src/Core/arch/HVX/GeneralBlockPanelKernel.h   |  41 -
 Eigen/src/Core/arch/HVX/PacketMath.h              | 807 ++++++++++++++++--
 3 files changed, 724 insertions(+), 128 deletions(-)
 delete mode 100644 Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h

diff --git a/Eigen/Core b/Eigen/Core
index 39f2b3fd0..f9d9974b0 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -378,10 +378,6 @@ using std::ptrdiff_t;
 #include "src/Core/arch/AVX512/GemmKernel.h"
 #endif
 
-#if defined(EIGEN_VECTORIZE_HVX)
-#include "src/Core/arch/HVX/GeneralBlockPanelKernel.h"
-#endif
-
 #include "src/Core/Select.h"
 #include "src/Core/VectorwiseOp.h"
 #include "src/Core/PartialReduxEvaluator.h"
diff --git a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h
deleted file mode 100644
index a15973959..000000000
--- a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef EIGEN_HVX_GENERAL_BLOCK_KERNEL_H
-#define EIGEN_HVX_GENERAL_BLOCK_KERNEL_H
-
-// Only support 128B HVX now.
-// Floating-point operations are only supported since V68.
-#if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
-
-namespace Eigen {
-namespace internal {
-
-template <bool ConjLhs_, bool ConjRhs_, int PacketSize_>
-class gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_>
-    : public gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> {
- public:
-  typedef Packet32qf AccPacket;
-
-  EIGEN_STRONG_INLINE void initAcc(Packet32qf& p) { p = pzero(p); }
-
-  template <typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const Packet32f& a, const Packet32f& b, Packet32qf& c, Packet32f& /*tmp*/,
-                                const LaneIdType&) const {
-    c = pmadd_f32_to_qf32(a, b, c);
-  }
-
-  template <typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const Packet32f& a, const QuadPacket<Packet32f>& b, Packet32qf& c, Packet32f& tmp,
-                                const LaneIdType& lane) const {
-    madd(a, b.get(lane), c, tmp, lane);
-  }
-
-  EIGEN_STRONG_INLINE void acc(const Packet32qf& c, const Packet32f& alpha, Packet32f& r) const {
-    r = pmadd_qf32_to_f32(c, alpha, r);
-  }
-};
-
-}  // end namespace internal
-}  // end namespace Eigen
-
-#endif  // __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
-
-#endif  // EIGEN_HVX_GENERAL_BLOCK_KERNEL_H
diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h
index 7c69f3b46..7e139de13 100644
--- a/Eigen/src/Core/arch/HVX/PacketMath.h
+++ b/Eigen/src/Core/arch/HVX/PacketMath.h
@@ -18,18 +18,107 @@ namespace Eigen {
 namespace internal {
 
-EIGEN_STRONG_INLINE HVX_Vector HVX_load(const void* mem) { return *((HVX_Vector*)mem); }
+// HVX utilities.
 
-EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const void* mem) { return *((HVX_UVector*)mem); }
+template <int D>
+EIGEN_STRONG_INLINE HVX_Vector HVX_vmem(const void* m) {
+  HVX_Vector v;
+#if EIGEN_COMP_CLANG
+  // Use inline assembly for an aligned vmem load on unaligned memory.
+  // A type cast to HVX_Vector* may mislead the compiler about the data's alignment.
+  __asm__("%0 = vmem(%1+#%2)" : "=v"(v) : "r"(m), "i"(D) : "memory");
+#else
+  void* aligned_mem =
+      reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(m) & ~(__HVX_LENGTH__ - 1)) + D * __HVX_LENGTH__);
+  memcpy(&v, aligned_mem, __HVX_LENGTH__);
+#endif
+  return v;
+}
 
-EIGEN_STRONG_INLINE void HVX_store(void* mem, HVX_Vector v) { *((HVX_Vector*)mem) = v; }
+template <typename T>
+EIGEN_STRONG_INLINE HVX_Vector HVX_load(const T* mem) {
+  HVX_Vector v;
+  memcpy(&v, reinterpret_cast<const HVX_Vector*>(mem), __HVX_LENGTH__);
+  return v;
+}
 
-EIGEN_STRONG_INLINE void HVX_storeu(void* mem, HVX_Vector v) { *((HVX_UVector*)mem) = v; }
+template <typename T>
+EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const T* mem) {
+  HVX_Vector v;
+  memcpy(&v, mem, __HVX_LENGTH__);
+  return v;
+}
+
+template <typename T, int Size, int Alignment>
+EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(const T* mem) {
+#if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD)
+  // Fast partial vector load through aligned vmem loads.
+  // The load may read past the end of the array, but it is aligned to prevent a memory fault.
+  HVX_Vector v0 = HVX_vmem<0>(mem);
+  HVX_Vector v1 = v0;
+  uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
+  EIGEN_IF_CONSTEXPR(Size * sizeof(T) <= Alignment) {
+    // Data no larger than the alignment never spans two aligned vectors.
+    v1 = v0;
+  }
+  else {
+    uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
+    if (left_off + Size * sizeof(T) > __HVX_LENGTH__) {
+      v1 = HVX_vmem<1>(mem);
+    } else {
+      v1 = v0;
+    }
+  }
+  return Q6_V_valign_VVR(v1, v0, mem_addr);
+#else
+  HVX_Vector v;
+  memcpy(&v, mem, Size * sizeof(T));
+  return v;
+#endif
+}
+
+template <typename T>
+EIGEN_STRONG_INLINE void HVX_store(T* mem, HVX_Vector v) {
+  memcpy(reinterpret_cast<HVX_Vector*>(mem), &v, __HVX_LENGTH__);
+}
+
+template <typename T>
+EIGEN_STRONG_INLINE void HVX_storeu(T* mem, HVX_Vector v) {
+  memcpy(mem, &v, __HVX_LENGTH__);
+}
+
+template <typename T, int Size, int Alignment>
+EIGEN_STRONG_INLINE void HVX_store_partial(T* mem, HVX_Vector v) {
+  uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
+  HVX_Vector value = Q6_V_vlalign_VVR(v, v, mem_addr);
+  uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
+  uintptr_t right_off = left_off + Size * sizeof(T);
+
+  HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr);
+  HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off);
+
+  EIGEN_IF_CONSTEXPR(Size * sizeof(T) > Alignment) {
+    if (right_off > __HVX_LENGTH__) {
+      Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ / sizeof(T), value);
+      qr = Q6_Q_vcmp_eq_VbVb(value, value);
+    }
+  }
+
+  ql_not = Q6_Q_or_QQn(ql_not, qr);
+  Q6_vmem_QnRIV(ql_not, mem, value);
+}
+
+// Packet definitions.
+enum class HVXPacketSize {
+  Full,
+  Half,
+  Quarter,
+};
 
 // Hexagon compiler uses the same HVX_Vector to represent all HVX vector types.
 // Wrap different vector types (float32, int32, etc.) in different classes with
 // explicit constructors and casting back and forth to HVX_Vector.
-template <int unique>
+template <HVXPacketSize T>
 class HVXPacket {
  public:
   HVXPacket() = default;
@@ -41,24 +130,62 @@ class HVXPacket {
   HVX_Vector m_val = Q6_V_vzero();
 };
 
-typedef HVXPacket<0> Packet32f;   // float32
-typedef HVXPacket<1> Packet32qf;  // qfloat32
+typedef HVXPacket<HVXPacketSize::Full> Packet32f;
+typedef HVXPacket<HVXPacketSize::Half> Packet16f;
+typedef HVXPacket<HVXPacketSize::Quarter> Packet8f;
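The masked load above composes two aligned vector loads with a byte rotate. The portable scalar model below may help build intuition (a minimal sketch; scalar_load_partial and kVecLen are illustrative names, not part of this patch). Note that in portable C++ the whole-block reads would be out of bounds; on Hexagon they are safe because an aligned 128-byte vmem load never crosses a page boundary.

  // Scalar model of HVX_load_partial's fast path (illustrative only).
  #include <cstdint>
  #include <cstring>

  template <typename T, int Size, int kVecLen = 128>
  void scalar_load_partial(const T* mem, unsigned char* out) {
    uintptr_t addr = reinterpret_cast<uintptr_t>(mem);
    const unsigned char* base = reinterpret_cast<const unsigned char*>(addr & ~uintptr_t(kVecLen - 1));
    uintptr_t off = addr & (kVecLen - 1);
    unsigned char blocks[2 * kVecLen] = {};
    memcpy(blocks, base, kVecLen);  // models HVX_vmem<0>(mem)
    if (off + Size * sizeof(T) > kVecLen) {
      memcpy(blocks + kVecLen, base + kVecLen, kVecLen);  // models HVX_vmem<1>(mem)
    }
    // Models Q6_V_valign_VVR(v1, v0, addr): the byte at offset `off` of the
    // aligned pair becomes lane 0 of the result.
    memcpy(out, blocks + off, kVecLen);
  }

HVX_store_partial is the mirror image: it rotates the payload up to its in-memory position (vlalign) and uses the two byte predicates so that only the bytes inside [mem, mem + Size) are written.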
+// Packet traits.
 template <>
 struct packet_traits<float> : default_packet_traits {
   typedef Packet32f type;
-  typedef Packet32f half;
+  typedef Packet16f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 32,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 0,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 0,
+    HasAbsDiff = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 0,
+    HasSetLinear = 0,
+    HasBlend = 0,
+
+    HasDiv = 0,
+    HasFloor = 0,
+    HasCeil = 0,
+    HasRint = 0,
+
+    HasSin = 0,
+    HasCos = 0,
+    HasACos = 0,
+    HasASin = 0,
+    HasATan = 0,
+    HasATanh = 0,
+    HasLog = 0,
+    HasExp = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasTanh = 0,
+    HasErf = 0,
+    HasBessel = 0,
+    HasNdtri = 0
   };
 };
 
 template <>
 struct unpacket_traits<Packet32f> {
   typedef float type;
-  typedef Packet32f half;
+  typedef Packet16f half;
   enum {
     size = 32,
     alignment = Aligned128,
@@ -68,94 +195,326 @@ struct unpacket_traits<Packet32f> {
   };
 };
 
-// float32 operations.
 template <>
-EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) {
+struct unpacket_traits<Packet16f> {
+  typedef float type;
+  typedef Packet8f half;
+  enum {
+    size = 16,
+    // Much generic code assumes the alignment equals the packet size instead
+    // of following this trait, so we do not use Aligned128 here to optimize
+    // aligned load/store.
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8f> {
+  typedef float type;
+  typedef Packet8f half;
+  enum {
+    size = 8,
+    // Much generic code assumes the alignment equals the packet size instead
+    // of following this trait, so we do not use Aligned128 here to optimize
+    // aligned load/store.
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+// float32 operations.
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pzero_hvx(const HVXPacket<T>&) {
+  return HVXPacket<T>::Create(Q6_V_vzero());
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pzero(const Packet32f&) {
+  return pzero_hvx(Packet32f());
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f&) {
+  return pzero_hvx(Packet16f());
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f&) {
+  return pzero_hvx(Packet8f());
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(const HVXPacket<T>& a) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  return unpacket_traits<HVXPacket<T>>::half::Create(
+      Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), sizeof(float) * packet_size / 2), a.Get())));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f predux_half_dowto4<Packet32f>(const Packet32f& a) {
+  return predux_half_dowto4_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
+  return predux_half_dowto4_hvx(a);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pset1_hvx(const float& from) {
   union {
     float f;
     int32_t i;
   } u;
   u.f = from;
-  return Packet32f::Create(Q6_V_vsplat_R(u.i));
+  return HVXPacket<T>::Create(Q6_V_vsplat_R(u.i));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) {
+  return pset1_hvx<HVXPacketSize::Full>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
+  return pset1_hvx<HVXPacketSize::Half>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
+  return pset1_hvx<HVXPacketSize::Quarter>(from);
 }
 
 template <>
 EIGEN_STRONG_INLINE Packet32f pload<Packet32f>(const float* from) {
   return Packet32f::Create(HVX_load(from));
 }
+template <>
+EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
+  return Packet16f::Create(
+      HVX_load_partial<float, unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
+  return Packet8f::Create(
+      HVX_load_partial<float, unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(from));
+}
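Together with packet_traits<float>::half, these unpacket_traits chains (Packet32f -> Packet16f -> Packet8f) are what let generic Eigen code halve the packet width when a loop tail is narrower than a full vector, and run halving reductions. A sketch of the idiom using only operations defined in this file (sum32 is an illustrative name, not part of the patch; predux for Packet8f is defined further below, and the input must be 128-byte aligned for pload):

  // Sum 32 floats by walking the half-packet chain (illustrative only).
  EIGEN_STRONG_INLINE float sum32(const float* data) {
    Packet32f full = pload<Packet32f>(data);
    Packet16f half = predux_half_dowto4(full);    // 16 pairwise partial sums
    Packet8f quarter = predux_half_dowto4(half);  // 8 partial sums
    return predux(quarter);                       // horizontal sum of 8 lanes
  }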
 
 template <>
 EIGEN_STRONG_INLINE Packet32f ploadu<Packet32f>(const float* from) {
   return Packet32f::Create(HVX_loadu(from));
 }
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
+  return Packet16f::Create(HVX_load_partial<float, unpacket_traits<Packet16f>::size, 0>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
+  return Packet8f::Create(HVX_load_partial<float, unpacket_traits<Packet8f>::size, 0>(from));
+}
 
 template <>
 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet32f& from) {
   HVX_store(to, from.Get());
 }
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
+  HVX_store_partial<float, unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(to, from.Get());
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
+  HVX_store_partial<float, unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(to, from.Get());
+}
 
 template <>
 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet32f& from) {
   HVX_storeu(to, from.Get());
 }
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
+  HVX_store_partial<float, unpacket_traits<Packet16f>::size, 0>(to, from.Get());
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
+  HVX_store_partial<float, unpacket_traits<Packet8f>::size, 0>(to, from.Get());
+}
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pmul_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
+  return pmul_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pmul_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pmul_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> padd_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
+  return padd_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return padd_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return padd_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> psub_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
+  return psub_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return psub_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return psub_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pnegate_hvx(const HVXPacket<T>& a) {
+  return HVXPacket<T>::Create(a.Get() ^ Q6_V_vsplat_R(0x80000000));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pnegate(const Packet32f& a) {
-  return psub(Packet32f::Create(Q6_V_vzero()), a);
+  return pnegate_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
+  return pnegate_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
+  return pnegate_hvx(a);
 }
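pnegate_hvx flips just the IEEE-754 sign bit rather than computing 0 - a, which takes one instruction and also behaves correctly for NaN and signed zero. The scalar equivalent (a sketch; negate_via_signbit is an illustrative name and uses C++20 std::bit_cast purely for exposition):

  #include <bit>
  #include <cstdint>

  // Scalar model of pnegate_hvx: XOR the sign bit (illustrative only).
  inline float negate_via_signbit(float x) {
    return std::bit_cast<float>(std::bit_cast<std::uint32_t>(x) ^ 0x80000000u);
  }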
 
-template <>
-EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) {
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
   HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
   HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get());
-  return Packet32f::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) {
+  return pcmp_le_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+  return pcmp_le_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
+  return pcmp_le_hvx(a, b);
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) {
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
   HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
   HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get());
-  return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) {
+  return pcmp_eq_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
+  return pcmp_eq_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
+  return pcmp_eq_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
+  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pcmp_lt(const Packet32f& a, const Packet32f& b) {
-  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
-  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
-  return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+  return pcmp_lt_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+  return pcmp_lt_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
+  return pcmp_lt_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
+  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) {
-  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
-  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
-  return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+  return pcmp_lt_or_nan_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
+  return pcmp_lt_or_nan_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
+  return pcmp_lt_or_nan_hvx(a, b);
 }
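Each pcmp_* variant materializes the HVX predicate register as a full packet: vmux writes all-ones lanes (every byte 0xff) where the comparison holds and zero lanes elsewhere. Such masks are meant to be combined with the bitwise operations and pselect defined later in this file, e.g. (a sketch; relu is an illustrative name):

  // Clamp negative lanes to zero with a comparison mask (illustrative only).
  EIGEN_STRONG_INLINE Packet32f relu(const Packet32f& x) {
    Packet32f zero = pzero(x);
    Packet32f mask = pcmp_lt(x, zero);  // all-ones where x < 0
    return pselect(mask, zero, x);      // mask set -> zero, else x
  }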
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pabs_hvx(const HVXPacket<T>& a) {
+  return HVXPacket<T>::Create(a.Get() & Q6_V_vsplat_R(0x7FFFFFFF));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pabs(const Packet32f& a) {
-  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), Q6_V_vzero());
-  return Packet32f::Create(Q6_V_vmux_QVV(pred, a.Get(), pnegate(a).Get()));
+  return pabs_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
+  return pabs_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
+  return pabs_hvx(a);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE float pfirst_hvx(const HVXPacket<T>& a) {
+  union {
+    float array[1];
+    HVX_Vector vector;
+  } HVX_and_array;
+  HVX_and_array.vector = a.Get();
+  return HVX_and_array.array[0];
+}
 template <>
 EIGEN_STRONG_INLINE float pfirst<Packet32f>(const Packet32f& a) {
-  float vsf[32] __attribute__((aligned(128)));
-  pstore(vsf, a);
-  return vsf[0];
+  return pfirst_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
+  return pfirst_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
+  return pfirst_hvx(a);
 }
 
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
@@ -166,13 +525,107 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
   // Shuffle the 64-bit lanes.
   HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
   HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
-
   kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
   kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
   kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
   kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
 }
 
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
+  // Shuffle the 32-bit lanes.
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+  // Shuffle the 64-bit lanes.
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+
+  kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
+  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
+  kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
+  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
+  // Shuffle the 32-bit lanes.
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+
+  // Shuffle the 64-bit lanes.
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+
+  kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
+  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 32));
+  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
+  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 96));
+}
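All of the ptranspose kernels use the same scheme: log2(N) rounds of Q6_W_vshuff_VVR, where each round interleaves blocks twice as wide as the previous one (4-byte, then 8-byte, then 16-byte, ...), leaving each output row at a fixed byte rotation that Q6_V_valign_VVR extracts. A scalar model of one 4x4 tile (a sketch; interleave and transpose4x4 are illustrative names):

  #include <cstring>

  // Interleave g-element blocks of a and b; lo/hi receive the two halves of
  // the combined result (models Q6_W_vshuff_VVR at granularity 4*g bytes).
  inline void interleave(const float* a, const float* b, int g, float* lo, float* hi) {
    float tmp[8];
    int k = 0;
    for (int i = 0; i < 4; i += g) {
      for (int j = 0; j < g; ++j) tmp[k++] = a[i + j];
      for (int j = 0; j < g; ++j) tmp[k++] = b[i + j];
    }
    memcpy(lo, tmp, 4 * sizeof(float));
    memcpy(hi, tmp + 4, 4 * sizeof(float));
  }

  inline void transpose4x4(float r[4][4]) {
    float t[4][4];
    interleave(r[0], r[1], 1, t[0], t[1]);  // "shuffle the 32-bit lanes"
    interleave(r[2], r[3], 1, t[2], t[3]);
    interleave(t[0], t[2], 2, r[0], r[1]);  // "shuffle the 64-bit lanes"
    interleave(t[1], t[3], 2, r[2], r[3]);
  }

For the narrow packets, several output rows land in one 128-byte register after the final round, which is why the Packet16f/Packet8f variants above peel rows off with valign instead of consuming both halves of every vector pair.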
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
+  // Shuffle the 32-bit lanes.
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
+  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
+
+  // Shuffle the 64-bit lanes.
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
+
+  // Shuffle the 128-bit lanes.
+  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
+
+  kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
+  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 32));
+  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 64));
+  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 96));
+  kernel.packet[4] = Packet8f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
+  kernel.packet[5] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 32));
+  kernel.packet[6] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 64));
+  kernel.packet[7] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 96));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
+  // Shuffle the 32-bit lanes.
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
+  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
+  HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
+  HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
+  HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
+  HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
+
+  // Shuffle the 64-bit lanes.
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
+  HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
+  HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
+
+  // Shuffle the 128-bit lanes.
+  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
+  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
+  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_5_4), -16);
+  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_5_4), -16);
+
+  // Shuffle the 256-bit lanes.
+  v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
+  v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
+  v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
+  v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
+
+  kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
+  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
+  kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
+  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
+  kernel.packet[4] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
+  kernel.packet[5] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_3_2), 64));
+  kernel.packet[6] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
+  kernel.packet[7] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_3_2), 64));
+  kernel.packet[8] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_5_4));
+  kernel.packet[9] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_5_4), 64));
+  kernel.packet[10] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_5_4));
+  kernel.packet[11] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_5_4), 64));
+  kernel.packet[12] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_7_6));
+  kernel.packet[13] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_7_6), 64));
+  kernel.packet[14] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_7_6));
+  kernel.packet[15] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_7_6), 64));
+}
 
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
   // Shuffle the 32-bit lanes.
   HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
@@ -298,29 +751,67 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
   kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_31_30));
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE float predux_hvx(const HVXPacket<T>& a) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  HVX_Vector vsum = Q6_Vqf32_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(), sizeof(float)));
+  for (int i = 2; i < packet_size; i <<= 1) {
+    vsum = Q6_Vqf32_vadd_Vqf32Vqf32(vsum, Q6_V_vror_VR(vsum, i * sizeof(float)));
+  }
+  return pfirst(HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(vsum)));
+}
 template <>
 EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) {
-  HVX_Vector vsum_4 = Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), 4), a.Get());
-  HVX_Vector vsum_8 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_4, 8), vsum_4);
-  HVX_Vector vsum_16 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_8, 16), vsum_8);
-  HVX_Vector vsum_32 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_16, 32), vsum_16);
-  HVX_Vector vsum_64 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_32, 64), vsum_32);
-  return pfirst(Packet32f::Create(Q6_Vsf_equals_Vqf32(vsum_64)));
+  return predux_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
+  return predux_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
+  return predux_hvx(a);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> ploaddup_hvx(const float* from) {
+  constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 2;
+  HVX_Vector load = HVX_load_partial<float, size, 0>(from);
+  HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
+  return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(dup));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f ploaddup<Packet32f>(const float* from) {
-  HVX_Vector load = HVX_loadu(from);
-  HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
-  return Packet32f::Create(HEXAGON_HVX_GET_V0(dup));
+  return ploaddup_hvx<HVXPacketSize::Full>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
+  return ploaddup_hvx<HVXPacketSize::Half>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from) {
+  return ploaddup_hvx<HVXPacketSize::Quarter>(from);
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet32f ploadquad<Packet32f>(const float* from) {
-  HVX_Vector load = HVX_loadu(from);
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> ploadquad_hvx(const float* from) {
+  constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 4;
+  HVX_Vector load = HVX_load_partial<float, size, 0>(from);
   HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
   HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
-  return Packet32f::Create(HEXAGON_HVX_GET_V0(quad));
+  return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(quad));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f ploadquad<Packet32f>(const float* from) {
+  return ploadquad_hvx<HVXPacketSize::Full>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
+  return ploadquad_hvx<HVXPacketSize::Half>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from) {
+  return ploadquad_hvx<HVXPacketSize::Quarter>(from);
 }
 
 template <>
@@ -330,99 +821,249 @@ EIGEN_STRONG_INLINE Packet32f preverse(const Packet32f& a) {
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
+EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
+  HVX_Vector delta = Q6_Vb_vsplat_R(0x3c);
+  return Packet16f::Create(Q6_V_vdelta_VV(a.Get(), delta));
 }
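predux_hvx (and predux_generic further below) reduce in log2(N) steps: rotate the vector by i lanes, combine it with itself, and double i; afterwards every lane, in particular lane 0 read by pfirst, holds the full reduction. predux_hvx additionally keeps the partial sums in the wider qf32 format and converts to IEEE float only once at the end. Scalar model (a sketch; rotate_reduce_sum is an illustrative name):

  // Rotate-and-add reduction over n lanes, n a power of two, n <= 32
  // (illustrative only).
  inline float rotate_reduce_sum(const float* lanes, int n) {
    float v[32], rot[32];
    for (int k = 0; k < n; ++k) v[k] = lanes[k];
    for (int i = 1; i < n; i <<= 1) {  // mirrors Q6_V_vror_VR by i lanes
      for (int k = 0; k < n; ++k) rot[k] = v[(k + i) % n];
      for (int k = 0; k < n; ++k) v[k] += rot[k];
    }
    return v[0];  // pfirst: every lane now holds the total
  }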
+template <>
+EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
+  HVX_Vector delta = Q6_Vb_vsplat_R(0x1c);
+  return Packet8f::Create(Q6_V_vdelta_VV(a.Get(), delta));
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pmin_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pmin<Packet32f>(const Packet32f& a, const Packet32f& b) {
+  return pmin_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pmin_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pmin_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pmax_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pmax<Packet32f>(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
+  return pmax_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pmax_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pmax_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pand_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(a.Get() & b.Get());
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pand<Packet32f>(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(a.Get() & b.Get());
+  return pand_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pand_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pand_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> por_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(a.Get() | b.Get());
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f por<Packet32f>(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(a.Get() | b.Get());
+  return por_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return por_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return por_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pxor_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(a.Get() ^ b.Get());
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pxor<Packet32f>(const Packet32f& a, const Packet32f& b) {
-  return Packet32f::Create(a.Get() ^ b.Get());
+  return pxor_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pxor_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pxor_hvx(a, b);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pnot_hvx(const HVXPacket<T>& a) {
+  return HVXPacket<T>::Create(~a.Get());
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pnot(const Packet32f& a) {
-  return Packet32f::Create(~a.Get());
+  return pnot_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pnot(const Packet16f& a) {
+  return pnot_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pnot(const Packet8f& a) {
+  return pnot_hvx(a);
 }
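The bitwise operations act on the raw lane bits of float packets; their main role is combining the all-ones masks produced by pcmp_*. For instance, the classic AND/OR blend below computes the same result that pselect (next) obtains from a single vmux (a sketch; blend_via_bitops is an illustrative name):

  // Select a where mask is all-ones, b where mask is zero (illustrative only).
  EIGEN_STRONG_INLINE Packet32f blend_via_bitops(const Packet32f& mask, const Packet32f& a, const Packet32f& b) {
    return por(pand(mask, a), pand(pnot(mask), b));
  }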
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pselect_hvx(const HVXPacket<T>& mask, const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, const Packet32f& b) {
-  HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
-  return Packet32f::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
+  return pselect_hvx(mask, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
+  return pselect_hvx(mask, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
+  return pselect_hvx(mask, a, b);
 }
 
-template <typename Op>
-EIGEN_STRONG_INLINE float predux_generic(const Packet32f& a, Op op) {
-  Packet32f vredux_4 = op(Packet32f::Create(Q6_V_vror_VR(a.Get(), 4)), a);
-  Packet32f vredux_8 = op(Packet32f::Create(Q6_V_vror_VR(vredux_4.Get(), 8)), vredux_4);
-  Packet32f vredux_16 = op(Packet32f::Create(Q6_V_vror_VR(vredux_8.Get(), 16)), vredux_8);
-  Packet32f vredux_32 = op(Packet32f::Create(Q6_V_vror_VR(vredux_16.Get(), 32)), vredux_16);
-  Packet32f vredux_64 = op(Packet32f::Create(Q6_V_vror_VR(vredux_32.Get(), 64)), vredux_32);
-  return pfirst(vredux_64);
+template <HVXPacketSize T, typename Op>
+EIGEN_STRONG_INLINE float predux_generic(const HVXPacket<T>& a, Op op) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  HVXPacket<T> vredux = a;
+  for (int i = 1; i < packet_size; i <<= 1) {
+    vredux = op(vredux, HVXPacket<T>::Create(Q6_V_vror_VR(vredux.Get(), i * sizeof(float))));
+  }
+  return pfirst(vredux);
 }
 
 template <>
 EIGEN_STRONG_INLINE float predux_max<Packet32f>(const Packet32f& a) {
   return predux_generic(a, pmax<Packet32f>);
 }
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
+  return predux_generic(a, pmax<Packet16f>);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a) {
+  return predux_generic(a, pmax<Packet8f>);
+}
 
 template <>
 EIGEN_STRONG_INLINE float predux_min<Packet32f>(const Packet32f& a) {
   return predux_generic(a, pmin<Packet32f>);
 }
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
+  return predux_generic(a, pmin<Packet16f>);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a) {
+  return predux_generic(a, pmin<Packet8f>);
+}
 
 template <>
 EIGEN_STRONG_INLINE bool predux_any(const Packet32f& a) {
   return predux_generic(a, por<Packet32f>) != 0.0f;
 }
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
+  return predux_generic(a, por<Packet16f>) != 0.0f;
+}
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
+  return predux_generic(a, por<Packet8f>) != 0.0f;
+}
 
 static const float index_vsf[32]
-    __attribute__((aligned(128))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-                                     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+    __attribute__((aligned(__HVX_LENGTH__))) = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> plset_hvx(const float& a) {
+  return padd(pload<HVXPacket<T>>(index_vsf), pset1<HVXPacket<T>>(a));
+}
 template <>
 EIGEN_STRONG_INLINE Packet32f plset<Packet32f>(const float& a) {
-  return padd(pload<Packet32f>(index_vsf), pset1<Packet32f>(a));
+  return plset_hvx<HVXPacketSize::Full>(a);
 }
-
-// qfloat32 operations.
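plset returns {a, a+1, a+2, ...}: a splat of `a` added to the static index table {0, 1, ..., 31}. One 32-entry table serves all three widths because every packet aliases the same 128-byte register and the narrower types simply ignore the upper lanes. Usage (a sketch; iota32 is an illustrative name):

  // dst[i] = 100.0f + i for i = 0..31 (illustrative only).
  EIGEN_STRONG_INLINE void iota32(float* dst) {
    pstoreu(dst, plset<Packet32f>(100.0f));
  }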
 template <>
-EIGEN_STRONG_INLINE Packet32qf pzero(const Packet32qf&) {
-  return Packet32qf::Create(Q6_V_vzero());
+EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
+  return plset_hvx<HVXPacketSize::Half>(a);
 }
-
 template <>
-EIGEN_STRONG_INLINE Packet32qf pmul(const Packet32qf& a, const Packet32qf& b) {
-  return Packet32qf::Create(Q6_Vqf32_vmpy_Vqf32Vqf32(a.Get(), b.Get()));
+EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) {
+  return plset_hvx<HVXPacketSize::Quarter>(a);
 }
 
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE void pscatter_hvx(float* to, const HVXPacket<T>& from, Index stride) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
+  pstore(elements, from);
+  for (Index i = 0; i < packet_size; ++i) {
+    to[i * stride] = elements[i];
+  }
+}
 template <>
-EIGEN_STRONG_INLINE Packet32qf padd(const Packet32qf& a, const Packet32qf& b) {
-  return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(a.Get(), b.Get()));
+EIGEN_STRONG_INLINE void pscatter<float, Packet32f>(float* to, const Packet32f& from, Index stride) {
+  pscatter_hvx(to, from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
+  pscatter_hvx(to, from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
+  pscatter_hvx(to, from, stride);
 }
 
-// Mixed float32 and qfloat32 operations.
-EIGEN_STRONG_INLINE Packet32qf pmadd_f32_to_qf32(const Packet32f& a, const Packet32f& b, const Packet32qf& c) {
-  return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()), c.Get()));
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pgather_hvx(const float* from, Index stride) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
+  for (Index i = 0; i < packet_size; i++) {
+    elements[i] = from[i * stride];
+  }
+  return pload<HVXPacket<T>>(elements);
 }
-
-EIGEN_STRONG_INLINE Packet32f pmadd_qf32_to_f32(const Packet32qf& a, const Packet32f& b, const Packet32f& c) {
-  return Packet32f::Create(Q6_Vsf_equals_Vqf32(
-      Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(a.Get()), b.Get()), c.Get())));
+template <>
+EIGEN_STRONG_INLINE Packet32f pgather<float, Packet32f>(const float* from, Index stride) {
+  return pgather_hvx<HVXPacketSize::Full>(from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
+  return pgather_hvx<HVXPacketSize::Half>(from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
+  return pgather_hvx<HVXPacketSize::Quarter>(from, stride);
 }
 
 }  // end namespace internal
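This HVX path has no strided gather/scatter instruction, so pgather/pscatter stage through a 128-byte-aligned stack buffer: a scalar loop moves the strided elements and a single vector load or store moves the packet. The typical consumer is strided matrix access, e.g. (a sketch; load_column and the row-major layout are assumptions for illustration):

  // Load 32 consecutive elements of column `col` from a row-major matrix
  // with `cols` columns (illustrative only).
  EIGEN_STRONG_INLINE Packet32f load_column(const float* matrix, Index cols, Index col) {
    return pgather<float, Packet32f>(matrix + col, cols);  // stride = cols
  }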