From 8290e21fb5c96f8d9f18b419eb0c17ae99c05f23 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 17 Nov 2016 13:21:15 -0500 Subject: [PATCH 01/19] replace sizeof(Packet) with PacketSize else it breaks for ZVector.Packet4f --- Eigen/src/Core/arch/ZVector/Complex.h | 230 +++++++++++- Eigen/src/Core/arch/ZVector/MathFunctions.h | 69 ++-- Eigen/src/Core/arch/ZVector/PacketMath.h | 386 +++++++++++++++++++- test/packetmath.cpp | 2 +- 4 files changed, 646 insertions(+), 41 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index e9d83eca6..d39d2d105 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2010 Gael Guennebaud +// Copyright (C) 2016 Konstantinos Margaritis // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -24,13 +25,48 @@ struct Packet1cd Packet2d v; }; +struct Packet2cf +{ + EIGEN_STRONG_INLINE Packet2cf() {} + EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} + union { + Packet4f v; + Packet1cd cd[2]; + }; +}; + +template<> struct packet_traits > : default_packet_traits +{ + typedef Packet2cf type; + typedef Packet2cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasBlend = 1, + HasSetLinear = 0 + }; +}; + + template<> struct packet_traits > : default_packet_traits { typedef Packet1cd type; typedef Packet1cd half; enum { Vectorizable = 1, - AlignedOnScalar = 0, + AlignedOnScalar = 1, size = 1, HasHalfPacket = 0, @@ -47,20 +83,68 @@ template<> struct packet_traits > : default_packet_traits }; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; +/* Forward declaration */ +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); + +template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + Packet2cf res; + res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); + res.cd[1] = res.cd[0]; + return res; +} +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride EIGEN_UNUSED) +{ + return pload(from); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + pstore >((std::complex *) af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride EIGEN_UNUSED) +{ + pstore >(to, from); +} + +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); } +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) +{ + Packet2cf res; + res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[0]))).v; + res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[1]))).v; + return res; +} template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { @@ -79,43 +163,90 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con return Packet1cd(v1 + v2); } - -template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } - -template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) +template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { - return pset1(*from); + Packet2cf res; + res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[0])), Packet1cd(reinterpret_cast(b.v.v4f[0]))).v; + res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[1])), Packet1cd(reinterpret_cast(b.v.v4f[1]))).v; + return res; } +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } + +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res[2]; - pstore >(res, a); + std::complex EIGEN_ALIGN16 res; + pstore >(&res, a); + + return res; +} +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +{ + std::complex EIGEN_ALIGN16 res[2]; + pstore >(res, a); return res[0]; } template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet2cf res; + res.cd[0] = a.cd[1]; + res.cd[1] = a.cd[0]; + return res; +} template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = padd(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + PacketBlock transpose; + transpose.packet[0] = vecs[0]; + transpose.packet[1] = vecs[1]; + ptranspose(transpose); + + return padd(transpose.packet[0], transpose.packet[1]); +} template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = pmul(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} template struct palign_impl @@ -127,6 +258,18 @@ struct palign_impl } }; +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset == 1) { + first.cd[0] = first.cd[1]; + first.cd[1] = second.cd[0]; + } + } +}; + template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const @@ -160,6 +303,39 @@ template<> struct conj_helper } }; +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for AltiVec @@ -168,17 +344,49 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, con return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); } +template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) +{ + // TODO optimize it for AltiVec + Packet2cf res; + res.cd[0] = pdiv(a.cd[0], b.cd[0]); + res.cd[1] = pdiv(a.cd[1], b.cd[1]); + return res; +} + EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) { return Packet1cd(preverse(Packet2d(x.v))); } +EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) +{ + Packet2cf res; + res.cd[0] = pcplxflip(x.cd[0]); + res.cd[1] = pcplxflip(x.cd[1]); + return res; +} + EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); kernel.packet[0].v = tmp; } + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet1cd tmp = kernel.packet[0].cd[1]; + kernel.packet[0].cd[1] = kernel.packet[1].cd[0]; + kernel.packet[1].cd[0] = tmp; +} + +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + Packet2cf result; + const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] }; + result.v = pblend(ifPacket4, thenPacket.v, elsePacket.v); + return result; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h index 6fff8524e..5c7aa7256 100644 --- a/Eigen/src/Core/arch/ZVector/MathFunctions.h +++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h @@ -3,6 +3,7 @@ // // Copyright (C) 2007 Julien Pommier // Copyright (C) 2009 Gael Guennebaud +// Copyright (C) 2016 Konstantinos Margaritis // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -19,32 +20,32 @@ namespace Eigen { namespace internal { +static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); +static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); +static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + +static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); +static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& _x) { Packet2d x = _x; - _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); - _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); - _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); - - _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); - Packet2d tmp, fx; Packet2l emm0; @@ -91,18 +92,44 @@ Packet2d pexp(const Packet2d& _x) isnumber_mask); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp(const Packet4f& x) +{ + Packet4f res; + res.v4f[0] = pexp(x.v4f[0]); + res.v4f[1] = pexp(x.v4f[1]); + return res; +} + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d psqrt(const Packet2d& x) { return __builtin_s390_vfsqdb(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psqrt(const Packet4f& x) +{ + Packet4f res; + res.v4f[0] = psqrt(x.v4f[0]); + res.v4f[1] = psqrt(x.v4f[1]); + return res; +} + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d prsqrt(const Packet2d& x) { // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. return pset1(1.0) / psqrt(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f prsqrt(const Packet4f& x) { + Packet4f res; + res.v4f[0] = prsqrt(x.v4f[0]); + res.v4f[1] = prsqrt(x.v4f[1]); + return res; +} + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 5a7226be6..e2deb25c8 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -28,9 +28,8 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif -// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif typedef __vector int Packet4i; @@ -42,6 +41,10 @@ typedef __vector double Packet2d; typedef __vector unsigned long long Packet2ul; typedef __vector long long Packet2l; +typedef struct { + Packet2d v4f[2]; +} Packet4f; + typedef union { int32_t i[4]; uint32_t ui[4]; @@ -88,6 +91,7 @@ static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; +static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; @@ -132,13 +136,11 @@ template<> struct packet_traits : default_packet_traits typedef Packet4i type; typedef Packet4i half; enum { - // FIXME check the Has* Vectorizable = 1, AlignedOnScalar = 1, size = 4, HasHalfPacket = 0, - // FIXME check the Has* HasAdd = 1, HasSub = 1, HasMul = 1, @@ -147,6 +149,37 @@ template<> struct packet_traits : default_packet_traits }; }; +template<> struct packet_traits : default_packet_traits +{ + typedef Packet4f type; + typedef Packet4f half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasNegate = 1, + HasBlend = 1 + }; +}; + template<> struct packet_traits : default_packet_traits { typedef Packet2d type; @@ -157,7 +190,6 @@ template<> struct packet_traits : default_packet_traits size=2, HasHalfPacket = 1, - // FIXME check the Has* HasAdd = 1, HasSub = 1, HasMul = 1, @@ -180,8 +212,12 @@ template<> struct packet_traits : default_packet_traits }; template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +/* Forward declaration */ +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel); + inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) { Packet vt; @@ -222,6 +258,32 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) return s; } +/* Helper function to simulate a vec_splat_packet4f + */ +template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) +{ + Packet4f splat; + switch (element) { + case 0: + splat.v4f[0] = vec_splat(from.v4f[0], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 1: + splat.v4f[0] = vec_splat(from.v4f[0], 1); + splat.v4f[1] = splat.v4f[0]; + break; + case 2: + splat.v4f[0] = vec_splat(from.v4f[1], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 3: + splat.v4f[0] = vec_splat(from.v4f[1], 1); + splat.v4f[1] = splat.v4f[0]; + break; + } + return splat; +} + template struct palign_impl { @@ -238,6 +300,31 @@ struct palign_impl } }; +/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double + */ +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + switch (Offset % 4) { + case 1: + first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8); + first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8); + break; + case 2: + first.v4f[0] = first.v4f[1]; + first.v4f[1] = second.v4f[0]; + break; + case 3: + first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8); + first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8); + break; + } + } +}; + + template struct palign_impl { @@ -257,6 +344,16 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) return vfrom->v4i; } +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet4f vfrom; + vfrom.v4f[0] = vec_ld2f(&from[0]); + vfrom.v4f[1] = vec_ld2f(&from[2]); + return vfrom; +} + template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { // FIXME: No intrinsic yet @@ -275,6 +372,15 @@ template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& f vto->v4i = from; } +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + vec_st2f(from.v4f[0], &to[0]); + vec_st2f(from.v4f[1], &to[2]); +} + + template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { // FIXME: No intrinsic yet @@ -288,10 +394,16 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return vec_splats(from); } - template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vec_splats(from); } +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) +{ + Packet4f to; + to.v4f[0] = pset1(static_cast(from)); + to.v4f[1] = to.v4f[0]; + return to; +} template<> EIGEN_STRONG_INLINE void pbroadcast4(const int *a, @@ -304,6 +416,17 @@ pbroadcast4(const int *a, a3 = vec_splat(a3, 3); } +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec_splat_packet4f<0>(a3); + a1 = vec_splat_packet4f<1>(a3); + a2 = vec_splat_packet4f<2>(a3); + a3 = vec_splat_packet4f<3>(a3); +} + template<> EIGEN_STRONG_INLINE void pbroadcast4(const double *a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) @@ -326,6 +449,16 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* f return pload(ai); } +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return pload(ai); +} + template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { double EIGEN_ALIGN16 af[2]; @@ -344,6 +477,16 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[3*stride] = ai[3]; } +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + pstore((float *)ai, from); + to[0*stride] = ai[0]; + to[1*stride] = ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} + template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { double EIGEN_ALIGN16 af[2]; @@ -353,52 +496,160 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, } template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return (a + b); } +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] + b.v4f[0]; + c.v4f[1] = a.v4f[1] + b.v4f[1]; + return c; +} template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return (a + b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return (a - b); } +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] - b.v4f[0]; + c.v4f[1] = a.v4f[1] - b.v4f[1]; + return c; +} template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return (a - b); } template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return (a * b); } +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] * b.v4f[0]; + c.v4f[1] = a.v4f[1] * b.v4f[1]; + return c; +} template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return (a * b); } template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { return (a / b); } +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] / b.v4f[0]; + c.v4f[1] = a.v4f[1] / b.v4f[1]; + return c; +} template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return (a / b); } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) +{ + Packet4f c; + c.v4f[0] = -a.v4f[0]; + c.v4f[1] = -a.v4f[1]; + return c; +} template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a, b), c); } +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) +{ + Packet4f res; + res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]); + res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return padd(pset1(a), p4i_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return padd(pset1(a), p2d_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pmin(a.v4f[0], b.v4f[0]); + res.v4f[1] = pmin(a.v4f[1], b.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pmax(a.v4f[0], b.v4f[0]); + res.v4f[1] = pmax(a.v4f[1], b.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pand(a.v4f[0], b.v4f[0]); + res.v4f[1] = pand(a.v4f[1], b.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pand(a.v4f[0], b.v4f[0]); + res.v4f[1] = pand(a.v4f[1], b.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pand(a.v4f[0], b.v4f[0]); + res.v4f[1] = pand(a.v4f[1], b.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return pand(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]); + res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]); + return res; +} +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_round(a.v4f[0]); + res.v4f[1] = vec_round(a.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_ceil(a.v4f[0]); + res.v4f[1] = vec_ceil(a.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_floor(a.v4f[0]); + res.v4f[1] = vec_floor(a.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { return pload(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { return pload(from); } template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { return pload(from); } @@ -408,6 +659,14 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) return vec_perm(p, p, p16uc_DUPLICATE32_HI); } +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + Packet4f p = pload(from); + p.v4f[1] = vec_splat(p.v4f[0], 1); + p.v4f[0] = vec_splat(p.v4f[0], 0); + return p; +} + template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { Packet2d p = pload(from); @@ -415,12 +674,15 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) @@ -433,8 +695,23 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); } -template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + Packet4f rev; + rev.v4f[0] = preverse(a.v4f[1]); + rev.v4f[1] = preverse(a.v4f[0]); + return rev; +} + +template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = pabs(a.v4f[0]); + res.v4f[1] = pabs(a.v4f[1]); + return res; +} template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { @@ -453,6 +730,13 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) sum = padd(a, b); return pfirst(sum); } +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) +{ + Packet2d sum; + sum = padd(a.v4f[0], a.v4f[1]); + double first = predux(sum); + return static_cast(first); +} template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) { @@ -493,6 +777,21 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) return sum; } +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +{ + PacketBlock transpose; + transpose.packet[0] = vecs[0]; + transpose.packet[1] = vecs[1]; + transpose.packet[2] = vecs[2]; + transpose.packet[3] = vecs[3]; + ptranspose(transpose); + + Packet4f sum = padd(transpose.packet[0], transpose.packet[1]); + sum = padd(sum, transpose.packet[2]); + sum = padd(sum, transpose.packet[3]); + return sum; +} + // Other reduction functions: // mul template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) @@ -507,6 +806,12 @@ template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + // Return predux_mul of the subvectors product + return static_cast(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1])))); +} + // min template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { @@ -521,6 +826,14 @@ template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) return pfirst(pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + Packet2d b, res; + b = pmin(a.v4f[0], a.v4f[1]); + res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} + // max template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { @@ -536,6 +849,14 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet2d b, res; + b = pmax(a.v4f[0], a.v4f[1]); + res = pmax(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} + EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); @@ -556,12 +877,61 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1] = t1; } +/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one + */ +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + PacketBlock t0,t1,t2,t3; + // copy top-left 2x2 Packet2d block + t0.packet[0] = kernel.packet[0].v4f[0]; + t0.packet[1] = kernel.packet[1].v4f[0]; + + // copy top-right 2x2 Packet2d block + t1.packet[0] = kernel.packet[0].v4f[1]; + t1.packet[1] = kernel.packet[1].v4f[1]; + + // copy bottom-left 2x2 Packet2d block + t2.packet[0] = kernel.packet[2].v4f[0]; + t2.packet[1] = kernel.packet[3].v4f[0]; + + // copy bottom-right 2x2 Packet2d block + t3.packet[0] = kernel.packet[2].v4f[1]; + t3.packet[1] = kernel.packet[3].v4f[1]; + + // Transpose all 2x2 blocks + ptranspose(t0); + ptranspose(t1); + ptranspose(t2); + ptranspose(t3); + + // Copy back transposed blocks, but exchange t1 and t2 due to transposition + kernel.packet[0].v4f[0] = t0.packet[0]; + kernel.packet[0].v4f[1] = t2.packet[0]; + kernel.packet[1].v4f[0] = t0.packet[1]; + kernel.packet[1].v4f[1] = t2.packet[1]; + kernel.packet[2].v4f[0] = t1.packet[0]; + kernel.packet[2].v4f[1] = t3.packet[0]; + kernel.packet[3].v4f[0] = t1.packet[1]; + kernel.packet[3].v4f[1] = t3.packet[1]; +} + template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); return vec_sel(elsePacket, thenPacket, mask); } +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] }; + Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] }; + Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast(p2l_ONE)); + Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast(p2l_ONE)); + Packet4f result; + result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi); + result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo); + return result; +} + template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); diff --git a/test/packetmath.cpp b/test/packetmath.cpp index c18d73496..7821a1738 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -591,7 +591,7 @@ template void packetmath_scatter_gather() int stride = internal::random(1,20); EIGEN_ALIGN_MAX Scalar buffer[PacketSize*20]; - memset(buffer, 0, 20*sizeof(Packet)); + memset(buffer, 0, 20*PacketSize*sizeof(Scalar)); Packet packet = internal::pload(data1); internal::pscatter(buffer, packet, stride); From e1e71ca4e49a504f638ae9fe449174425f565196 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Sun, 6 Aug 2017 19:53:18 -0400 Subject: [PATCH 02/19] initial support for z14 --- Eigen/src/Core/arch/ZVector/PacketMath.h | 809 ++++++++++++++--------- 1 file changed, 514 insertions(+), 295 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 57b01fc63..2e5f5774d 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -41,9 +41,14 @@ typedef __vector double Packet2d; typedef __vector unsigned long long Packet2ul; typedef __vector long long Packet2l; +// Z14 has builtin support for float vectors +#if ~defined(__ARCH__) || (defined(__ARCH__) && __ARCH < 14) typedef struct { Packet2d v4f[2]; } Packet4f; +#else +typedef __vector float Packet4f; +#endif typedef union { int32_t i[4]; @@ -51,11 +56,15 @@ typedef union { int64_t l[2]; uint64_t ul[2]; double d[2]; + float f[4]; Packet4i v4i; Packet4ui v4ui; Packet2l v2l; Packet2ul v2ul; Packet2d v2d; +#if ~defined(__ARCH__) || (defined(__ARCH__) && __ARCH >= 14) + Packet4f v4f; +#endif } Packet; // We don't want to write the same code all the time, but we need to reuse the constants @@ -258,32 +267,6 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) return s; } -/* Helper function to simulate a vec_splat_packet4f - */ -template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) -{ - Packet4f splat; - switch (element) { - case 0: - splat.v4f[0] = vec_splat(from.v4f[0], 0); - splat.v4f[1] = splat.v4f[0]; - break; - case 1: - splat.v4f[0] = vec_splat(from.v4f[0], 1); - splat.v4f[1] = splat.v4f[0]; - break; - case 2: - splat.v4f[0] = vec_splat(from.v4f[1], 0); - splat.v4f[1] = splat.v4f[0]; - break; - case 3: - splat.v4f[0] = vec_splat(from.v4f[1], 1); - splat.v4f[1] = splat.v4f[0]; - break; - } - return splat; -} - template struct palign_impl { @@ -300,31 +283,6 @@ struct palign_impl } }; -/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double - */ -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) - { - switch (Offset % 4) { - case 1: - first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8); - first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8); - break; - case 2: - first.v4f[0] = first.v4f[1]; - first.v4f[1] = second.v4f[0]; - break; - case 3: - first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8); - first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8); - break; - } - } -}; - - template struct palign_impl { @@ -344,16 +302,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) return vfrom->v4i; } -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_LOAD - Packet4f vfrom; - vfrom.v4f[0] = vec_ld2f(&from[0]); - vfrom.v4f[1] = vec_ld2f(&from[2]); - return vfrom; -} - template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { // FIXME: No intrinsic yet @@ -372,15 +320,6 @@ template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& f vto->v4i = from; } -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_STORE - vec_st2f(from.v4f[0], &to[0]); - vec_st2f(from.v4f[1], &to[2]); -} - - template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { // FIXME: No intrinsic yet @@ -397,13 +336,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vec_splats(from); } -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) -{ - Packet4f to; - to.v4f[0] = pset1(static_cast(from)); - to.v4f[1] = to.v4f[0]; - return to; -} template<> EIGEN_STRONG_INLINE void pbroadcast4(const int *a, @@ -416,17 +348,6 @@ pbroadcast4(const int *a, a3 = vec_splat(a3, 3); } -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const float *a, - Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) -{ - a3 = pload(a); - a0 = vec_splat_packet4f<0>(a3); - a1 = vec_splat_packet4f<1>(a3); - a2 = vec_splat_packet4f<2>(a3); - a3 = vec_splat_packet4f<3>(a3); -} - template<> EIGEN_STRONG_INLINE void pbroadcast4(const double *a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) @@ -449,16 +370,6 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* f return pload(ai); } -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) -{ - float EIGEN_ALIGN16 ai[4]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - return pload(ai); -} - template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { double EIGEN_ALIGN16 af[2]; @@ -477,16 +388,6 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[3*stride] = ai[3]; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) -{ - float EIGEN_ALIGN16 ai[4]; - pstore((float *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; -} - template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { double EIGEN_ALIGN16 af[2]; @@ -496,160 +397,52 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, } template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return (a + b); } -template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] + b.v4f[0]; - c.v4f[1] = a.v4f[1] + b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return (a + b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return (a - b); } -template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] - b.v4f[0]; - c.v4f[1] = a.v4f[1] - b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return (a - b); } template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return (a * b); } -template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] * b.v4f[0]; - c.v4f[1] = a.v4f[1] * b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return (a * b); } template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { return (a / b); } -template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) -{ - Packet4f c; - c.v4f[0] = a.v4f[0] / b.v4f[0]; - c.v4f[1] = a.v4f[1] / b.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return (a / b); } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); } -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) -{ - Packet4f c; - c.v4f[0] = -a.v4f[0]; - c.v4f[1] = -a.v4f[1]; - return c; -} template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a, b), c); } -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) -{ - Packet4f res; - res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]); - res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return padd(pset1(a), p4i_COUNTDOWN); } -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return padd(pset1(a), p2d_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pmin(a.v4f[0], b.v4f[0]); - res.v4f[1] = pmin(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pmax(a.v4f[0], b.v4f[0]); - res.v4f[1] = pmax(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pand(a.v4f[0], b.v4f[0]); - res.v4f[1] = pand(a.v4f[1], b.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return pand(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) -{ - Packet4f res; - res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]); - res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]); - return res; -} -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = vec_round(a.v4f[0]); - res.v4f[1] = vec_round(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } -template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = vec_ceil(a.v4f[0]); - res.v4f[1] = vec_ceil(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } -template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = vec_floor(a.v4f[0]); - res.v4f[1] = vec_floor(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { return pload(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { return pload(from); } template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { return pload(from); } @@ -659,14 +452,6 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) return vec_perm(p, p, p16uc_DUPLICATE32_HI); } -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) -{ - Packet4f p = pload(from); - p.v4f[1] = vec_splat(p.v4f[0], 1); - p.v4f[0] = vec_splat(p.v4f[0], 0); - return p; -} - template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { Packet2d p = pload(from); @@ -674,15 +459,12 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { pstore(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) @@ -695,23 +477,8 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) -{ - Packet4f rev; - rev.v4f[0] = preverse(a.v4f[1]); - rev.v4f[1] = preverse(a.v4f[0]); - return rev; -} - template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) -{ - Packet4f res; - res.v4f[0] = pabs(a.v4f[0]); - res.v4f[1] = pabs(a.v4f[1]); - return res; -} template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { @@ -730,13 +497,6 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) sum = padd(a, b); return pfirst(sum); } -template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) -{ - Packet2d sum; - sum = padd(a.v4f[0], a.v4f[1]); - double first = predux(sum); - return static_cast(first); -} template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) { @@ -777,21 +537,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) return sum; } -template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) -{ - PacketBlock transpose; - transpose.packet[0] = vecs[0]; - transpose.packet[1] = vecs[1]; - transpose.packet[2] = vecs[2]; - transpose.packet[3] = vecs[3]; - ptranspose(transpose); - - Packet4f sum = padd(transpose.packet[0], transpose.packet[1]); - sum = padd(sum, transpose.packet[2]); - sum = padd(sum, transpose.packet[3]); - return sum; -} - // Other reduction functions: // mul template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) @@ -806,12 +551,6 @@ template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ - // Return predux_mul of the subvectors product - return static_cast(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1])))); -} - // min template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { @@ -826,14 +565,6 @@ template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) return pfirst(pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) -{ - Packet2d b, res; - b = pmin(a.v4f[0], a.v4f[1]); - res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); - return static_cast(pfirst(res)); -} - // max template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { @@ -849,14 +580,6 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) -{ - Packet2d b, res; - b = pmax(a.v4f[0], a.v4f[1]); - res = pmax(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); - return static_cast(pfirst(res)); -} - EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); @@ -877,6 +600,321 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1] = t1; } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + + +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + +/* z13 has no vector float support so we emulate that with double + z14 has proper vector float support. +*/ +#if ~defined(__ARCH__) || (defined(__ARCH__) && __ARCH < 14) +/* Helper function to simulate a vec_splat_packet4f + */ +template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) +{ + Packet4f splat; + switch (element) { + case 0: + splat.v4f[0] = vec_splat(from.v4f[0], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 1: + splat.v4f[0] = vec_splat(from.v4f[0], 1); + splat.v4f[1] = splat.v4f[0]; + break; + case 2: + splat.v4f[0] = vec_splat(from.v4f[1], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 3: + splat.v4f[0] = vec_splat(from.v4f[1], 1); + splat.v4f[1] = splat.v4f[0]; + break; + } + return splat; +} + +/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double + */ +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + switch (Offset % 4) { + case 1: + first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8); + first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8); + break; + case 2: + first.v4f[0] = first.v4f[1]; + first.v4f[1] = second.v4f[0]; + break; + case 3: + first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8); + first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8); + break; + } + } +}; + +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet4f vfrom; + vfrom.v4f[0] = vec_ld2f(&from[0]); + vfrom.v4f[1] = vec_ld2f(&from[2]); + return vfrom; +} + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + vec_st2f(from.v4f[0], &to[0]); + vec_st2f(from.v4f[1], &to[2]); +} + +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) +{ + Packet4f to; + to.v4f[0] = pset1(static_cast(from)); + to.v4f[1] = to.v4f[0]; + return to; +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec_splat_packet4f<0>(a3); + a1 = vec_splat_packet4f<1>(a3); + a2 = vec_splat_packet4f<2>(a3); + a3 = vec_splat_packet4f<3>(a3); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return pload(ai); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + float EIGEN_ALIGN16 ai[4]; + pstore((float *)ai, from); + to[0*stride] = ai[0]; + to[1*stride] = ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} + +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] + b.v4f[0]; + c.v4f[1] = a.v4f[1] + b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] - b.v4f[0]; + c.v4f[1] = a.v4f[1] - b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] * b.v4f[0]; + c.v4f[1] = a.v4f[1] * b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) +{ + Packet4f c; + c.v4f[0] = a.v4f[0] / b.v4f[0]; + c.v4f[1] = a.v4f[1] / b.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) +{ + Packet4f c; + c.v4f[0] = -a.v4f[0]; + c.v4f[1] = -a.v4f[1]; + return c; +} + +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) +{ + Packet4f res; + res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]); + res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pmin(a.v4f[0], b.v4f[0]); + res.v4f[1] = pmin(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pmax(a.v4f[0], b.v4f[0]); + res.v4f[1] = pmax(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pand(a.v4f[0], b.v4f[0]); + res.v4f[1] = pand(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pand(a.v4f[0], b.v4f[0]); + res.v4f[1] = pand(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pand(a.v4f[0], b.v4f[0]); + res.v4f[1] = pand(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) +{ + Packet4f res; + res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]); + res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_round(a.v4f[0]); + res.v4f[1] = vec_round(a.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_ceil(a.v4f[0]); + res.v4f[1] = vec_ceil(a.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = vec_floor(a.v4f[0]); + res.v4f[1] = vec_floor(a.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + Packet4f p = pload(from); + p.v4f[1] = vec_splat(p.v4f[0], 1); + p.v4f[0] = vec_splat(p.v4f[0], 0); + return p; +} + +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } + +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + Packet4f rev; + rev.v4f[0] = preverse(a.v4f[1]); + rev.v4f[1] = preverse(a.v4f[0]); + return rev; +} + +template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) +{ + Packet4f res; + res.v4f[0] = pabs(a.v4f[0]); + res.v4f[1] = pabs(a.v4f[1]); + return res; +} + +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) +{ + Packet2d sum; + sum = padd(a.v4f[0], a.v4f[1]); + double first = predux(sum); + return static_cast(first); +} + +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +{ + PacketBlock transpose; + transpose.packet[0] = vecs[0]; + transpose.packet[1] = vecs[1]; + transpose.packet[2] = vecs[2]; + transpose.packet[3] = vecs[3]; + ptranspose(transpose); + + Packet4f sum = padd(transpose.packet[0], transpose.packet[1]); + sum = padd(sum, transpose.packet[2]); + sum = padd(sum, transpose.packet[3]); + return sum; +} + +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + // Return predux_mul of the subvectors product + return static_cast(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1])))); +} + +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + Packet2d b, res; + b = pmin(a.v4f[0], a.v4f[1]); + res = pmin(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} + +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet2d b, res; + b = pmax(a.v4f[0], a.v4f[1]); + res = pmax(b, reinterpret_cast(vec_sld(reinterpret_cast(b), reinterpret_cast(b), 8))); + return static_cast(pfirst(res)); +} + /* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one */ EIGEN_DEVICE_FUNC inline void @@ -915,12 +953,6 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3].v4f[1] = t3.packet[1]; } -template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; - Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); - return vec_sel(elsePacket, thenPacket, mask); -} - template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] }; Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] }; @@ -931,13 +963,200 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo); return result; } +#else +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } + } +}; -template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { - Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; - Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v4f; +} + +template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v4f = from; +} + +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) +{ + return vec_splats(from); +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) +{ + float EIGEN_ALIGN16 af[4]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + af[2] = from[2*stride]; + af[3] = from[3*stride]; + return pload(af); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) +{ + float EIGEN_ALIGN16 af[4]; + pstore((float*)af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; + to[2*stride] = af[2]; + to[3*stride] = af[3]; +} + +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return (a + b); } +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return (a - b); } +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return (a * b); } +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return (a / b); } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return (-a); } +template<> EIGEN_STRONG_INLINE Packet4f pconj (const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4f pmadd (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); } +template<> EIGEN_STRONG_INLINE Packet4f plset (const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4f pmin (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmax (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pand (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f por (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pxor (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4f pround (const Packet4f& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet4f pceil (const Packet4f& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor (const Packet4f& a) { return vec_floor(a); } +template<> EIGEN_STRONG_INLINE Packet4f pabs (const Packet4f& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu (const float* from) { return pload(from); } + +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } + +template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) +{ + Packet4f p = pload(from); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); +} + +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); +} + +template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) +{ + Packet4f b, sum; + b = vec_sld(a, a, 8); + sum = padd(a, b); + b = vec_sld(sum, sum, 4); + sum = padd(sum, b); + return pfirst(sum); +} + +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +{ + Packet4f v[4], sum[4]; + + // It's easier and faster to transpose then add as columns + // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation + // Do the transpose, first set of moves + v[0] = vec_mergeh(vecs[0], vecs[2]); + v[1] = vec_mergel(vecs[0], vecs[2]); + v[2] = vec_mergeh(vecs[1], vecs[3]); + v[3] = vec_mergel(vecs[1], vecs[3]); + // Get the resulting vectors + sum[0] = vec_mergeh(v[0], v[2]); + sum[1] = vec_mergel(v[0], v[2]); + sum[2] = vec_mergeh(v[1], v[3]); + sum[3] = vec_mergel(v[1], v[3]); + + // Now do the summation: + // Lines 0+1 + sum[0] = padd(sum[0], sum[1]); + // Lines 2+3 + sum[1] = padd(sum[2], sum[3]); + // Add the results + sum[0] = padd(sum[0], sum[1]); + + return sum[0]; +} + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) +{ + return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); +} + +// min +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + Packet4f b, res; + b = pmin(a, vec_sld(a, a, 8)); + res = pmin(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +// max +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + Packet4f b, res; + b = pmax(a, vec_sld(a, a, 8)); + res = pmax(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); return vec_sel(elsePacket, thenPacket, mask); } +#endif + +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { return pload(from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } + } // end namespace internal } // end namespace Eigen From 0c9ad2f52526bd7f07ba41161e68bee7231c05c7 Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Thu, 14 Sep 2017 19:23:38 +0200 Subject: [PATCH 03/19] std::integral_constant is not C++03 compatible --- Eigen/src/Core/util/XprHelper.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index c88f1dbb9..10328be0d 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -36,14 +36,16 @@ inline IndexDest convert_index(const IndexSrc& idx) { // true if T can be considered as an integral index (i.e., and integral type or enum) template struct is_valid_index_type - : std::integral_constant::value || std::is_enum::value #else // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index. internal::is_convertible::value #endif -> {}; + }; +}; // promote_scalar_arg is an helper used in operation between an expression and a scalar, like: // expression * scalar From 23f8b00bc884fe94e94ea273538e5546a4160e4f Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Thu, 14 Sep 2017 19:26:03 +0200 Subject: [PATCH 04/19] clang provides __has_feature(is_enum) (but not ) in C++03 mode --- Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index caedb0b3a..f5b071a9c 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -412,7 +412,7 @@ // Does the compiler support type_trais? #ifndef EIGEN_HAS_TYPE_TRAITS -#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || __has_feature(is_enum) || EIGEN_COMP_MSVC >= 1700) +#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) #define EIGEN_HAS_TYPE_TRAITS 1 #define EIGEN_INCLUDE_TYPE_TRAITS #else From 7ad07fc6f2e1dd74554ba576367883c4236c6b98 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Sep 2017 10:22:00 +0200 Subject: [PATCH 05/19] Update documentation for aligned_allocator --- Eigen/src/Core/util/Memory.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 7d9053496..f48d115f4 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -696,7 +696,15 @@ template void swap(scoped_array &a,scoped_array &b) /** \class aligned_allocator * \ingroup Core_Module * -* \brief STL compatible allocator to use with with 16 byte aligned types +* \brief STL compatible allocator to use with types requiring a non standrad alignment. +* +* The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd. +* By default, it will thus provide at least 16 bytes alignment and more in following cases: +* - 32 bytes alignment if AVX is enabled. +* - 64 bytes alignment if AVX512 is enabled. +* +* This can be controled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented +* \link TopicPreprocessorDirectivesPerformance there \endlink. * * Example: * \code From f92567fecc02f7653c5974ed9b162e49a813dc0c Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Wed, 20 Sep 2017 10:22:23 +0200 Subject: [PATCH 06/19] Add link to a useful example. --- Eigen/src/plugins/IndexedViewMethods.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index 215bd0530..a7ec63adf 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -248,6 +248,8 @@ operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST * * For 1D vectors and arrays, you better use the operator()(const Indices&) overload, which behave the same way but taking a single parameter. * + * See also this question and its answer for an example of how to duplicate coefficients. + * * \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index) */ template From 8579195169ba046b980b01769edb581b281e0b8a Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 22 Sep 2017 09:23:24 +0200 Subject: [PATCH 07/19] bug #1468 (1/2) : add missing std:: to memcpy --- Eigen/QtAlignedMalloc | 2 +- Eigen/src/Core/util/Memory.h | 2 +- Eigen/src/SparseCore/AmbiVector.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/QtAlignedMalloc b/Eigen/QtAlignedMalloc index c6571f129..4f07df02a 100644 --- a/Eigen/QtAlignedMalloc +++ b/Eigen/QtAlignedMalloc @@ -27,7 +27,7 @@ void qFree(void *ptr) void *qRealloc(void *ptr, std::size_t size) { void* newPtr = Eigen::internal::aligned_malloc(size); - memcpy(newPtr, ptr, size); + std::memcpy(newPtr, ptr, size); Eigen::internal::aligned_free(ptr); return newPtr; } diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index f48d115f4..c455f92a1 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -493,7 +493,7 @@ template struct smart_copy_helper { IntPtr size = IntPtr(end)-IntPtr(start); if(size==0) return; eigen_internal_assert(start!=0 && end!=0 && target!=0); - memcpy(target, start, size); + std::memcpy(target, start, size); } }; diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h index 8a5cc91f2..e0295f2af 100644 --- a/Eigen/src/SparseCore/AmbiVector.h +++ b/Eigen/src/SparseCore/AmbiVector.h @@ -94,7 +94,7 @@ class AmbiVector Index allocSize = m_allocatedElements * sizeof(ListEl); allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar); Scalar* newBuffer = new Scalar[allocSize]; - memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); + std::memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); delete[] m_buffer; m_buffer = newBuffer; } From 0e85a677e36956f13a3d85047d088d773b39b69b Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Tue, 26 Sep 2017 10:53:33 +0200 Subject: [PATCH 08/19] bug #1472: fix warning --- Eigen/src/Core/Map.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 7ca6a9280..c437f1a92 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -114,7 +114,7 @@ template class Ma inline Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() - : internal::traits::OuterStrideAtCompileTime != Dynamic ? internal::traits::OuterStrideAtCompileTime + : internal::traits::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits::OuterStrideAtCompileTime) : IsVectorAtCompileTime ? (this->size() * innerStride()) : int(Flags)&RowMajorBit ? (this->cols() * innerStride()) : (this->rows() * innerStride()); From bc30305d2989443901c8ebe27045c9e996d32a40 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 9 Oct 2017 16:55:10 -0400 Subject: [PATCH 09/19] complete z14 port --- Eigen/src/Core/arch/ZVector/Complex.h | 423 ++++++++++++++------ Eigen/src/Core/arch/ZVector/MathFunctions.h | 15 +- Eigen/src/Core/arch/ZVector/PacketMath.h | 42 +- 3 files changed, 329 insertions(+), 151 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index 1bfb73397..3c72968e6 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -15,6 +15,10 @@ namespace Eigen { namespace internal { +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; +#endif + static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 }; @@ -29,10 +33,14 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) union { Packet4f v; Packet1cd cd[2]; }; +#else + Packet4f v; +#endif }; template<> struct packet_traits > : default_packet_traits @@ -89,63 +97,27 @@ template<> struct unpacket_traits { typedef std::complex type /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); -template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } +/* complex first */ template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } -template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) -{ - Packet2cf res; - res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); - res.cd[1] = res.cd[0]; - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) -{ - std::complex EIGEN_ALIGN16 af[2]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - return pload(af); -} template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride EIGEN_UNUSED) { return pload(from); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) -{ - std::complex EIGEN_ALIGN16 af[2]; - pstore >((std::complex *) af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; -} template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride EIGEN_UNUSED) { pstore >(to, from); } - -template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } -template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } -template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); } -template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) -{ - Packet2cf res; - res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[0]))).v; - res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[1]))).v; - return res; -} - template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { Packet2d a_re, a_im, v1, v2; @@ -163,27 +135,12 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con return Packet1cd(v1 + v2); } -template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) -{ - Packet2cf res; - res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[0])), Packet1cd(reinterpret_cast(b.v.v4f[0]))).v; - res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[1])), Packet1cd(reinterpret_cast(b.v.v4f[1]))).v; - return res; -} - -template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } -template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v,b.v)); } - +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } -template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) @@ -193,61 +150,20 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pac return res; } -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) -{ - std::complex EIGEN_ALIGN16 res[2]; - pstore >(res, a); - - return res[0]; -} template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) -{ - Packet2cf res; - res.cd[0] = a.cd[1]; - res.cd[1] = a.cd[0]; - return res; -} - template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } -template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) -{ - std::complex res; - Packet1cd b = padd(a.cd[0], a.cd[1]); - vec_st2f(b.v, (float*)&res); - return res; -} - template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } -template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) -{ - PacketBlock transpose; - transpose.packet[0] = vecs[0]; - transpose.packet[1] = vecs[1]; - ptranspose(transpose); - - return padd(transpose.packet[0], transpose.packet[1]); -} - template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) -{ - std::complex res; - Packet1cd b = pmul(a.cd[0], a.cd[1]); - vec_st2f(b.v, (float*)&res); - return res; -} - template struct palign_impl { @@ -258,18 +174,6 @@ struct palign_impl } }; -template -struct palign_impl -{ - static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) - { - if (Offset == 1) { - first.cd[0] = first.cd[1]; - first.cd[1] = second.cd[0]; - } - } -}; - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const @@ -303,6 +207,156 @@ template<> struct conj_helper } }; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) + +template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for AltiVec + Packet1cd res = conj_helper().pmul(a,b); + Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); + return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); +} + +EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} + +/* complex follows */ +template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +{ + std::complex EIGEN_ALIGN16 res[2]; + pstore >(res, a); + + return res[0]; +} + + +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + Packet2cf res; + res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); + res.cd[1] = res.cd[0]; + return res; +} +#else +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + Packet2cf res; + if((std::ptrdiff_t(&from) % 16) == 0) + res.v = pload((const float *)&from); + else + res.v = ploadu((const float *)&from); + res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI); + return res; +} +#endif + +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + pstore >((std::complex *) af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} + +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } + +template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } + +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } + + +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) +{ + Packet2cf res; + res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[0]))).v; + res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[1]))).v; + return res; +} + +template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) +{ + Packet2cf res; + res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[0])), Packet1cd(reinterpret_cast(b.v.v4f[0]))).v; + res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[1])), Packet1cd(reinterpret_cast(b.v.v4f[1]))).v; + return res; +} + +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet2cf res; + res.cd[0] = a.cd[1]; + res.cd[1] = a.cd[0]; + return res; +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = padd(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + PacketBlock transpose; + transpose.packet[0] = vecs[0]; + transpose.packet[1] = vecs[1]; + ptranspose(transpose); + + return padd(transpose.packet[0], transpose.packet[1]); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = pmul(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset == 1) { + first.cd[0] = first.cd[1]; + first.cd[1] = second.cd[0]; + } + } +}; + template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const @@ -337,15 +391,6 @@ template<> struct conj_helper }; EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) - -template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) -{ - // TODO optimize it for AltiVec - Packet1cd res = conj_helper().pmul(a,b); - Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); - return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); -} template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { @@ -356,11 +401,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con return res; } -EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) -{ - return Packet1cd(preverse(Packet2d(x.v))); -} - EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) { Packet2cf res; @@ -369,13 +409,6 @@ EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) return res; } -EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); - kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); - kernel.packet[0].v = tmp; -} - EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { Packet1cd tmp = kernel.packet[0].cd[1]; @@ -389,6 +422,136 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, con result.v = pblend(ifPacket4, thenPacket.v, elsePacket.v); return result; } +#else +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR))); } +template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) +{ + Packet4f v1, v2; + + // Permute and multiply the real parts of a and b + v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD); + // Get the imaginary parts of a + v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); + // multiply a_re * b + v1 = vec_madd(v1, b.v, p4f_ZERO); + // multiply a_im * b and get the conjugate result + v2 = vec_madd(v2, b.v, p4f_ZERO); + v2 = reinterpret_cast(pxor(v2, reinterpret_cast(p4ui_CONJ_XOR))); + // permute back to a proper order + v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); + + return Packet2cf(padd(v1, v2)); +} + +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet4f rev_a; + rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2); + return Packet2cf(rev_a); +} + +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + Packet4f b; + b = vec_sld(a.v, a.v, 8); + b = padd(a.v, b); + return pfirst(Packet2cf(b)); +} + +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + Packet4f b1, b2; + b1 = vec_sld(vecs[0].v, vecs[1].v, 8); + b2 = vec_sld(vecs[1].v, vecs[0].v, 8); + b2 = vec_sld(b2, b2, 8); + b2 = padd(b1, b2); + + return Packet2cf(b2); +} + +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + Packet4f b; + Packet2cf prod; + b = vec_sld(a.v, a.v, 8); + prod = pmul(a, Packet2cf(b)); + + return pfirst(prod); +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset==1) + { + first.v = vec_sld(first.v, second.v, 8); + } + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) + +template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) +{ + // TODO optimize it for AltiVec + Packet2cf res = conj_helper().pmul(a, b); + Packet4f s = pmul(b.v, b.v); + return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); +} + +template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x) +{ + return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV)); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} + +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + Packet2cf result; + result.v = reinterpret_cast(pblend(ifPacket, reinterpret_cast(thenPacket.v), reinterpret_cast(elsePacket.v))); + return result; +} +#endif } // end namespace internal diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h index 5c7aa7256..234d82be7 100644 --- a/Eigen/src/Core/arch/ZVector/MathFunctions.h +++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h @@ -96,37 +96,48 @@ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& x) { Packet4f res; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + res = pexp(x); +#else res.v4f[0] = pexp(x.v4f[0]); res.v4f[1] = pexp(x.v4f[1]); +#endif return res; } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d psqrt(const Packet2d& x) { - return __builtin_s390_vfsqdb(x); + return vec_sqrt(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psqrt(const Packet4f& x) { Packet4f res; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + res = vec_sqrt(x); +#else res.v4f[0] = psqrt(x.v4f[0]); res.v4f[1] = psqrt(x.v4f[1]); +#endif return res; } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d prsqrt(const Packet2d& x) { - // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. return pset1(1.0) / psqrt(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& x) { Packet4f res; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + res = pset1(1.0) / psqrt(x); +#else res.v4f[0] = prsqrt(x.v4f[0]); res.v4f[1] = prsqrt(x.v4f[1]); +#endif return res; } diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 2e5f5774d..a3294d318 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -17,7 +17,7 @@ namespace Eigen { namespace internal { #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16 #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD @@ -29,7 +29,7 @@ namespace internal { #endif #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif typedef __vector int Packet4i; @@ -42,12 +42,12 @@ typedef __vector unsigned long long Packet2ul; typedef __vector long long Packet2l; // Z14 has builtin support for float vectors -#if ~defined(__ARCH__) || (defined(__ARCH__) && __ARCH < 14) +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +typedef __vector float Packet4f; +#else typedef struct { Packet2d v4f[2]; } Packet4f; -#else -typedef __vector float Packet4f; #endif typedef union { @@ -62,7 +62,7 @@ typedef union { Packet2l v2l; Packet2ul v2ul; Packet2d v2d; -#if ~defined(__ARCH__) || (defined(__ARCH__) && __ARCH >= 14) +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) Packet4f v4f; #endif } Packet; @@ -89,7 +89,7 @@ typedef union { Packet2l p2l_##NAME = pset1(X) // These constants are endian-agnostic -//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1} static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); @@ -99,6 +99,15 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ + Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) + +static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} +static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000}; +#endif + static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); @@ -129,9 +138,9 @@ static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0 static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; -//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; +static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; -//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC @@ -616,7 +625,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons /* z13 has no vector float support so we emulate that with double z14 has proper vector float support. */ -#if ~defined(__ARCH__) || (defined(__ARCH__) && __ARCH < 14) +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) /* Helper function to simulate a vec_splat_packet4f */ template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) @@ -1041,7 +1050,6 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return (-a); } template<> EIGEN_STRONG_INLINE Packet4f pconj (const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4f pmadd (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); } -template<> EIGEN_STRONG_INLINE Packet4f plset (const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } template<> EIGEN_STRONG_INLINE Packet4f pmin (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pmax (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pand (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } @@ -1052,9 +1060,6 @@ template<> EIGEN_STRONG_INLINE Packet4f pround (const Packet4f& a) { r template<> EIGEN_STRONG_INLINE Packet4f pceil (const Packet4f& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor (const Packet4f& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4f pabs (const Packet4f& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu (const float* from) { return pload(from); } - -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) @@ -1068,7 +1073,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); } -template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) +template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { Packet4f b, sum; b = vec_sld(a, a, 8); @@ -1152,10 +1157,9 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons #endif template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { return pload(from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu (const float* from) { return pload(from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE Packet4f plset (const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } } // end namespace internal From 374f750ad4708408a1255a98964719fd598b0659 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 9 Oct 2017 16:56:30 -0400 Subject: [PATCH 10/19] eliminate 'enumeral and non-enumeral type in conditional expression' warning --- Eigen/src/Core/Map.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 7ca6a9280..169123824 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -114,7 +114,7 @@ template class Ma inline Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() - : internal::traits::OuterStrideAtCompileTime != Dynamic ? internal::traits::OuterStrideAtCompileTime + : internal::traits::OuterStrideAtCompileTime != Dynamic ? int(internal::traits::OuterStrideAtCompileTime) : IsVectorAtCompileTime ? (this->size() * innerStride()) : int(Flags)&RowMajorBit ? (this->cols() * innerStride()) : (this->rows() * innerStride()); From c2a224648919acb27bc208dcc24797345e3a1353 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 10 Oct 2017 13:38:32 -0400 Subject: [PATCH 11/19] fix predux_mul for z14/float --- Eigen/src/Core/arch/ZVector/PacketMath.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index a3294d318..21612ba91 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -1115,7 +1115,9 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) // mul template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { - return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); + Packet4f prod; + prod = pmul(a, vec_sld(a, a, 8)); + return pfirst(pmul(prod, vec_sld(prod, prod, 4))); } // min From 3dcae2a27ffca563ef0a5b78d9d9d27cb05216a8 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 11 Oct 2017 09:40:45 -0400 Subject: [PATCH 12/19] initial pexp() for 32-bit floats, commented out due to vec_cts() --- Eigen/src/Core/arch/ZVector/PacketMath.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 21612ba91..a72aea851 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -103,6 +103,12 @@ static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) +#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ + Packet4f p4f_##NAME = pset1(X) + +#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ + const Packet4f p4f_##NAME = reinterpret_cast(pset1(X)) + static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000}; @@ -187,7 +193,11 @@ template<> struct packet_traits : default_packet_traits HasSin = 0, HasCos = 0, HasLog = 0, +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) + HasExp = 0, +#else HasExp = 1, +#endif HasSqrt = 1, HasRsqrt = 1, HasRound = 1, From df173f562062843a73454f2eb2479ae1e26dbcdf Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 11 Oct 2017 09:40:49 -0400 Subject: [PATCH 13/19] initial pexp() for 32-bit floats, commented out due to vec_cts() --- Eigen/src/Core/arch/ZVector/MathFunctions.h | 96 +++++++++++++++++++-- 1 file changed, 90 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h index 234d82be7..ff33a975f 100644 --- a/Eigen/src/Core/arch/ZVector/MathFunctions.h +++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h @@ -20,6 +20,50 @@ namespace Eigen { namespace internal { +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); +static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); +static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); +static _EIGEN_DECLARE_CONST_Packet4i(23, 23); + +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); + +/* the smallest non denormalized float number */ +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + +static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); +static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); +#endif + static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); @@ -93,16 +137,56 @@ Packet2d pexp(const Packet2d& _x) } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& x) +Packet4f pexp(const Packet4f& _x) { - Packet4f res; #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) - res = pexp(x); +/* + Packet4f x = _x; + + Packet4f tmp, fx; + Packet4i emm0; + + // clamp x + x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); + + // express exp(x) as exp(g + n*log(2)) + fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); + + fx = pfloor(fx); + + tmp = pmul(fx, p4f_cephes_exp_C1); + Packet4f z = pmul(fx, p4f_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + z = pmul(x,x); + + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + + // build 2^n + emm0 = vec_cts(fx, 0); + emm0 = emm0 + p4i_0x7f; + emm0 = emm0 << reinterpret_cast(p4i_23); + + // Altivec's max & min operators just drop silent NaNs. Check NaNs in + // inputs and return them unmodified. + Packet4ui isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); + return vec_sel(_x, pmax(pmul(y, reinterpret_cast(emm0)), _x), + isnumber_mask);*/ + return _x; #else - res.v4f[0] = pexp(x.v4f[0]); - res.v4f[1] = pexp(x.v4f[1]); -#endif + Packet4f res; + res.v4f[0] = pexp(_x.v4f[0]); + res.v4f[1] = pexp(_x.v4f[1]); return res; +#endif } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED From d0b7b9d0d321905776326ce99c5c3ff3d48f4ce7 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 11 Oct 2017 10:17:22 -0400 Subject: [PATCH 14/19] some Packet2cf pmul fixes --- Eigen/src/Core/arch/ZVector/Complex.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index 3c72968e6..f9e3a480a 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -426,21 +426,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, con template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR))); } template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { - Packet4f v1, v2; + Packet4f a_re, a_im, prod, prod_im; // Permute and multiply the real parts of a and b - v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD); + a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD); // Get the imaginary parts of a - v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); - // multiply a_re * b - v1 = vec_madd(v1, b.v, p4f_ZERO); + a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); // multiply a_im * b and get the conjugate result - v2 = vec_madd(v2, b.v, p4f_ZERO); - v2 = reinterpret_cast(pxor(v2, reinterpret_cast(p4ui_CONJ_XOR))); + prod_im = a_im * b.v; + prod_im = pxor(prod_im, reinterpret_cast(p4ui_CONJ_XOR)); // permute back to a proper order - v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); - - return Packet2cf(padd(v1, v2)); + prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV); + + // multiply a_re * b, add prod_im + prod = pmadd(a_re, b.v, prod_im); + + return Packet2cf(prod); } template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) From 380d41fd76f2e8aeb4f8d4b9138515a05c9d70e3 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 11 Oct 2017 10:40:12 -0400 Subject: [PATCH 15/19] added some extra debugging --- Eigen/src/Core/arch/ZVector/Complex.h | 10 ++++++++++ Eigen/src/Core/arch/ZVector/PacketMath.h | 11 +++++++++++ 2 files changed, 21 insertions(+) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index f9e3a480a..cd4316687 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -428,18 +428,28 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con { Packet4f a_re, a_im, prod, prod_im; + std::cout << "a = " << a.v << std::endl; + std::cout << ", b = " << b.v << std::endl; // Permute and multiply the real parts of a and b a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD); + + std::cout << "a_re = " << a_re << std::endl; // Get the imaginary parts of a a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); + std::cout << "a_im = " << a_im << std::endl; + // multiply a_im * b and get the conjugate result prod_im = a_im * b.v; + std::cout << "prod_im = " << prod_im << std::endl; prod_im = pxor(prod_im, reinterpret_cast(p4ui_CONJ_XOR)); + std::cout << "prod_im = " << prod_im << std::endl; // permute back to a proper order prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV); + std::cout << "prod_im = " << prod_im << std::endl; // multiply a_re * b, add prod_im prod = pmadd(a_re, b.v, prod_im); + std::cout << "prod = " << prod << std::endl; return Packet2cf(prod); } diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index a72aea851..0b37f4992 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -286,6 +286,17 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) return s; } +#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) +inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) +{ + Packet vt; + vt.v4f = v; + s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3]; + return s; +} +#endif + + template struct palign_impl { From c4ad358565a1099bc0bdad797c3ca8bd297ea770 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 11 Oct 2017 11:05:29 -0400 Subject: [PATCH 16/19] explicitly set conjugate mask --- Eigen/src/Core/arch/ZVector/Complex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index cd4316687..7504acff9 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -16,7 +16,7 @@ namespace Eigen { namespace internal { #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) -static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; +static Packet4ui p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO); #endif static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; From 98e52cc770ac26fbd29aaa7583443009d7937084 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 12 Oct 2017 15:22:10 -0400 Subject: [PATCH 17/19] rollback 374f750ad4708408a1255a98964719fd598b0659 --- Eigen/src/Core/Map.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 169123824..7ca6a9280 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -114,7 +114,7 @@ template class Ma inline Index outerStride() const { return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() - : internal::traits::OuterStrideAtCompileTime != Dynamic ? int(internal::traits::OuterStrideAtCompileTime) + : internal::traits::OuterStrideAtCompileTime != Dynamic ? internal::traits::OuterStrideAtCompileTime : IsVectorAtCompileTime ? (this->size() * innerStride()) : int(Flags)&RowMajorBit ? (this->cols() * innerStride()) : (this->rows() * innerStride()); From 6c3475f110a017b3872a94a4ef13d0c8b543faca Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 12 Oct 2017 15:34:55 -0400 Subject: [PATCH 18/19] remove debugging --- Eigen/src/Core/arch/ZVector/Complex.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index 7504acff9..95aba428f 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -428,28 +428,20 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con { Packet4f a_re, a_im, prod, prod_im; - std::cout << "a = " << a.v << std::endl; - std::cout << ", b = " << b.v << std::endl; // Permute and multiply the real parts of a and b a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD); - std::cout << "a_re = " << a_re << std::endl; // Get the imaginary parts of a a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); - std::cout << "a_im = " << a_im << std::endl; // multiply a_im * b and get the conjugate result prod_im = a_im * b.v; - std::cout << "prod_im = " << prod_im << std::endl; prod_im = pxor(prod_im, reinterpret_cast(p4ui_CONJ_XOR)); - std::cout << "prod_im = " << prod_im << std::endl; // permute back to a proper order prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV); - std::cout << "prod_im = " << prod_im << std::endl; // multiply a_re * b, add prod_im prod = pmadd(a_re, b.v, prod_im); - std::cout << "prod = " << prod << std::endl; return Packet2cf(prod); } From 0e6e027e91c134104d3595e5bfcd4a7d2a44a6dd Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Thu, 12 Oct 2017 15:38:34 -0400 Subject: [PATCH 19/19] check both z13 and z14 arches --- CMakeLists.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e1648fd1..62bf823c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -271,12 +271,18 @@ if(NOT MSVC) message(STATUS "Enabling NEON in tests/examples") endif() - option(EIGEN_TEST_ZVECTOR "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF) - if(EIGEN_TEST_ZVECTOR) + option(EIGEN_TEST_Z13 "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF) + if(EIGEN_TEST_Z13) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector") message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples") endif() + option(EIGEN_TEST_Z14 "Enable/Disable S390X(zEC14) ZVECTOR in tests/examples" OFF) + if(EIGEN_TEST_Z14) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector") + message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples") + endif() + check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP) if(COMPILER_SUPPORT_OPENMP) option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF)