diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index 3e665730c..47c91ddf7 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -15,7 +15,9 @@ namespace Eigen { namespace internal { -static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; +inline Packet4ui p4ui_CONJ_XOR() { + return vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; +} #ifdef __VSX__ #if defined(_BIG_ENDIAN) static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; @@ -29,8 +31,54 @@ static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (P //---------- float ---------- struct Packet2cf { - EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {} + EIGEN_STRONG_INLINE explicit Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) + { + Packet4f v1, v2; + + // Permute and multiply the real parts of a and b + v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD); + // Get the imaginary parts of a + v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); + // multiply a_re * b + v1 = vec_madd(v1, b.v, p4f_ZERO); + // multiply a_im * b and get the conjugate result + v2 = vec_madd(v2, b.v, p4f_ZERO); + v2 = reinterpret_cast(pxor(v2, reinterpret_cast(p4ui_CONJ_XOR()))); + // permute back to a proper order + v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); + + return Packet2cf(padd(v1, v2)); + } + + EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) { + v = pmul(Packet2cf(*this), b).v; + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const { + return Packet2cf(*this) *= b; + } + + EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) { + v = padd(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const { + return Packet2cf(*this) += b; + } + EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) { + v = psub(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { + return Packet2cf(*this) -= b; + } + EIGEN_STRONG_INLINE Packet2cf operator-(void) const { + return Packet2cf(-v); + } + Packet4f v; }; @@ -82,14 +130,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; pstore >((std::complex *) af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -98,26 +146,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR))); } - -template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) -{ - Packet4f v1, v2; - - // Permute and multiply the real parts of a and b - v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD); - // Get the imaginary parts of a - v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); - // multiply a_re * b - v1 = vec_madd(v1, b.v, p4f_ZERO); - // multiply a_im * b and get the conjugate result - v2 = vec_madd(v2, b.v, p4f_ZERO); - v2 = reinterpret_cast(pxor(v2, reinterpret_cast(p4ui_CONJ_XOR))); - // permute back to a proper order - v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); - - return Packet2cf(padd(v1, v2)); -} +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR()))); } template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v, b.v)); } @@ -128,7 +157,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::co template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - std::complex EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex res[2]; pstore((float *)&res, a.v); return res[0]; @@ -152,7 +181,7 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Packe template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) { Packet4f b1, b2; -#ifdef _BIG_ENDIAN +#ifdef _BIG_ENDIAN b1 = vec_sld(vecs[0].v, vecs[1].v, 8); b2 = vec_sld(vecs[1].v, vecs[0].v, 8); #else @@ -260,6 +289,51 @@ struct Packet1cd { EIGEN_STRONG_INLINE Packet1cd() {} EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) + { + Packet2d a_re, a_im, v1, v2; + + // Permute and multiply the real parts of a and b + a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI); + // Get the imaginary parts of a + a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO); + // multiply a_re * b + v1 = vec_madd(a_re, b.v, p2d_ZERO); + // multiply a_im * b and get the conjugate result + v2 = vec_madd(a_im, b.v, p2d_ZERO); + v2 = reinterpret_cast(vec_sld(reinterpret_cast(v2), reinterpret_cast(v2), 8)); + v2 = pxor(v2, reinterpret_cast(p2ul_CONJ_XOR1)); + + return Packet1cd(padd(v1, v2)); + } + + EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) { + v = pmul(Packet1cd(*this), b).v; + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const { + return Packet1cd(*this) *= b; + } + + EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) { + v = padd(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const { + return Packet1cd(*this) += b; + } + EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) { + v = psub(v, b.v); + return *this; + } + EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const { + return Packet1cd(*this) -= b; + } + EIGEN_STRONG_INLINE Packet1cd operator-(void) const { + return Packet1cd(-v); + } + Packet2d v; }; @@ -296,19 +370,13 @@ template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex< template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } -template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index) { - std::complex EIGEN_ALIGN16 af[2]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - return pload(af); + return pload(from); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index) { - std::complex EIGEN_ALIGN16 af[2]; - pstore >(af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; + pstore >(to, from); } template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } @@ -316,24 +384,6 @@ template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, con template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast(p2ul_CONJ_XOR2))); } -template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) -{ - Packet2d a_re, a_im, v1, v2; - - // Permute and multiply the real parts of a and b - a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI); - // Get the imaginary parts of a - a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO); - // multiply a_re * b - v1 = vec_madd(a_re, b.v, p2d_ZERO); - // multiply a_im * b and get the conjugate result - v2 = vec_madd(a_im, b.v, p2d_ZERO); - v2 = reinterpret_cast(vec_sld(reinterpret_cast(v2), reinterpret_cast(v2), 8)); - v2 = pxor(v2, reinterpret_cast(p2ul_CONJ_XOR1)); - - return Packet1cd(padd(v1, v2)); -} - template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); } @@ -345,7 +395,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex res[2]; pstore >(res, a); return res[0]; diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 08a27d153..27f61a49b 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -22,10 +22,6 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 @@ -40,9 +36,8 @@ typedef __vector unsigned char Packet16uc; // We don't want to write the same code all the time, but we need to reuse the constants // and it doesn't really work to declare them global, so we define macros instead - #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ - Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) + Packet4f p4f_##NAME = {X, X, X, X} #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ Packet4i p4i_##NAME = vec_splat_s32(X) @@ -64,7 +59,7 @@ typedef __vector unsigned char Packet16uc; #define DST_CHAN 1 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) - +#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits::type // These constants are endian-agnostic static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} @@ -77,21 +72,15 @@ static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui) static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} #endif +static Packet4ui p4ui_SIGN = {0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u}; +static Packet4ui p4ui_PREV0DOT5 = {0x3EFFFFFFu, 0x3EFFFFFFu, 0x3EFFFFFFu, 0x3EFFFFFFu}; + static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; -// Mask alignment -#ifdef __PPC64__ -#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 -#else -#define _EIGEN_MASK_ALIGNMENT 0xfffffff0 -#endif - -#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) - // Handle endianness properly while loading constants // Define global static constants: #ifdef _BIG_ENDIAN @@ -235,112 +224,131 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) return s; } -// Need to define them first or we get specialization after instantiation errors -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +template +EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from) { + // some versions of GCC throw "unused-but-set-parameter". + // ignoring these warnings for now. + EIGEN_UNUSED_VARIABLE(from); EIGEN_DEBUG_ALIGNED_LOAD #ifdef __VSX__ - return vec_vsx_ld(0, from); + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); #else return vec_ld(0, from); #endif } +// Need to define them first or we get specialization after instantiation errors +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) +{ + return pload_common(from); +} + template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { - EIGEN_DEBUG_ALIGNED_LOAD + return pload_common(from); +} + +template +EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){ + // some versions of GCC throw "unused-but-set-parameter" (float *to). + // ignoring these warnings for now. + EIGEN_UNUSED_VARIABLE(to); + EIGEN_DEBUG_ALIGNED_STORE #ifdef __VSX__ - return vec_vsx_ld(0, from); + vec_xst(from, 0, to); #else - return vec_ld(0, from); + vec_st(from, 0, to); #endif } template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { - EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else - vec_st(from, 0, to); -#endif + pstore_common(to, from); } template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { - EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else - vec_st(from, 0, to); -#endif + pstore_common(to, from); +} + +template +EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from) +{ + Packet v = {from, from, from, from}; + return v; } template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { - Packet4f v = {from, from, from, from}; - return v; + return pset1_size4(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { - Packet4i v = {from, from, from, from}; - return v; + return pset1_size4(from); } -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const float *a, - Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) + +template EIGEN_STRONG_INLINE void +pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) { - a3 = pload(a); + a3 = pload(a); a0 = vec_splat(a3, 0); a1 = vec_splat(a3, 1); a2 = vec_splat(a3, 2); a3 = vec_splat(a3, 3); } + +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + pbroadcast4_common(a, a0, a1, a2, a3); +} template<> EIGEN_STRONG_INLINE void pbroadcast4(const int *a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) { - a3 = pload(a); - a0 = vec_splat(a3, 0); - a1 = vec_splat(a3, 1); - a2 = vec_splat(a3, 2); - a3 = vec_splat(a3, 3); + pbroadcast4_common(a, a0, a1, a2, a3); +} + +template EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; + a[0] = from[0*stride]; + a[1] = from[1*stride]; + a[2] = from[2*stride]; + a[3] = from[3*stride]; + return pload(a); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - float EIGEN_ALIGN16 af[4]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - af[2] = from[2*stride]; - af[3] = from[3*stride]; - return pload(af); + return pgather_common(from, stride); } + template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { - int EIGEN_ALIGN16 ai[4]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - return pload(ai); + return pgather_common(from, stride); } + +template EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) +{ + EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; + pstore<__UNPACK_TYPE__(Packet)>(a, from); + to[0*stride] = a[0]; + to[1*stride] = a[1]; + to[2*stride] = a[2]; + to[3*stride] = a[3]; +} + template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - float EIGEN_ALIGN16 af[4]; - pstore(af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; - to[2*stride] = af[2]; - to[3*stride] = af[3]; + pscatter_size4(to, from, stride); } + template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) { - int EIGEN_ALIGN16 ai[4]; - pstore((int *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; + pscatter_size4(to, from, stride); } template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return pset1(a) + p4f_COUNTDOWN; } @@ -424,66 +432,71 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) +{ + Packet4f t = vec_add(reinterpret_cast(vec_or(vec_and(reinterpret_cast(a), p4ui_SIGN), p4ui_PREV0DOT5)), a); + Packet4f res; + +#ifdef __VSX__ + __asm__("xvrspiz %x0, %x1\n\t" + : "=&wa" (res) + : "wa" (t)); +#else + __asm__("vrfiz %0, %1\n\t" + : "=v" (res) + : "v" (t)); +#endif + + return res; +} template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return vec_floor(a); } -#ifdef _BIG_ENDIAN -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) +template EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from) { - EIGEN_DEBUG_ALIGNED_LOAD - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data - -} -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data -} + EIGEN_DEBUG_UNALIGNED_LOAD +#ifdef __VSX__ + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); #else -// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD - return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from)); + Packet16uc mask = vec_lvsl(0, from); // create the permute mask + Packet16uc MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword + Packet16uc LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword + //TODO: Add static_cast here + return (Packet) vec_perm(MSQ, LSQ, mask); // align the data +#endif } + template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { - EIGEN_DEBUG_UNALIGNED_LOAD - return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from)); + return ploadu_common(from); +} +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) +{ + return ploadu_common(from); } -#endif +template EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from) +{ + Packet p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); +} template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { - Packet4f p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE32_HI); + return ploaddup_common(from); } template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) { - Packet4i p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE32_HI); + return ploaddup_common(from); } -#ifdef _BIG_ENDIAN -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +template EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from) { EIGEN_DEBUG_UNALIGNED_STORE +#ifdef __VSX__ + vec_xst(from, 0, to); +#else // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html // Warning: not thread safe! Packet16uc MSQ, LSQ, edges; @@ -497,45 +510,23 @@ template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& f MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first - vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part + vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second +#endif +} +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) +{ + pstoreu_common(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { - EIGEN_DEBUG_UNALIGNED_STORE - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - // Warning: not thread safe! - Packet16uc MSQ, LSQ, edges; - Packet16uc edgeAlign, align; - - MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword - edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges - align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) - vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first - vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part + pstoreu_common(to, from); } -#else -// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) -{ - EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to)); -} -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) -{ - EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); -} -#endif template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { @@ -643,37 +634,42 @@ template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) } // min -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +template EIGEN_STRONG_INLINE +__UNPACK_TYPE__(Packet) predux_min4(const Packet& a) { - Packet4f b, res; + Packet b, res; b = vec_min(a, vec_sld(a, a, 8)); res = vec_min(b, vec_sld(b, b, 4)); return pfirst(res); } + +template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) +{ + return predux_min4(a); +} + template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) { - Packet4i b, res; - b = vec_min(a, vec_sld(a, a, 8)); - res = vec_min(b, vec_sld(b, b, 4)); - return pfirst(res); + return predux_min4(a); } - // max -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +template EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) { - Packet4f b, res; + Packet b, res; b = vec_max(a, vec_sld(a, a, 8)); res = vec_max(b, vec_sld(b, b, 4)); return pfirst(res); } +template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) +{ + return predux_max4(a); +} + template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) { - Packet4i b, res; - b = vec_max(a, vec_sld(a, a, 8)); - res = vec_max(b, vec_sld(b, b, 4)); - return pfirst(res); + return predux_max4(a); } template @@ -730,9 +726,9 @@ struct palign_impl } }; -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet4f t0, t1, t2, t3; +template +EIGEN_DEVICE_FUNC inline void ptranpose_common(PacketBlock& kernel) { + T t0, t1, t2, t3; t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); @@ -743,29 +739,23 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = vec_mergel(t1, t3); } -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet4i t0, t1, t2, t3; - t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); - t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); - t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); - t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); - kernel.packet[0] = vec_mergeh(t0, t2); - kernel.packet[1] = vec_mergel(t0, t2); - kernel.packet[2] = vec_mergeh(t1, t3); - kernel.packet[3] = vec_mergel(t1, t3); +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { ptranpose_common(kernel); } + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { ptranpose_common(kernel); } + +template EIGEN_STRONG_INLINE +Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); + return vec_sel(elsePacket, thenPacket, mask); } template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; - Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); - return vec_sel(elsePacket, thenPacket, mask); + return pblend4(ifPacket, thenPacket, elsePacket); } template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; - Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); - return vec_sel(elsePacket, thenPacket, mask); + return pblend4(ifPacket, thenPacket, elsePacket); } @@ -785,6 +775,8 @@ static Packet2l p2l_ZERO = reinterpret_cast(p4i_ZERO); static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO = reinterpret_cast(p4f_ZERO); static Packet2d p2d_MZERO = { -0.0, -0.0 }; +static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull}; +static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull}; #ifdef _BIG_ENDIAN static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); @@ -792,16 +784,9 @@ static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_c static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ONE), reinterpret_cast(p2d_ZERO), 8)); #endif -template Packet2d vec_splat_dbl(Packet2d& a); - -template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a) +template Packet2d vec_splat_dbl(Packet2d& a) { - return reinterpret_cast(vec_perm(a, a, p16uc_PSET64_HI)); -} - -template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a) -{ - return reinterpret_cast(vec_perm(a, a, p16uc_PSET64_LO)); + return vec_splat(a, index); } template<> struct packet_traits : default_packet_traits @@ -826,7 +811,11 @@ template<> struct packet_traits : default_packet_traits HasLog = 0, HasExp = 1, HasSqrt = 1, +#if !EIGEN_COMP_CLANG HasRsqrt = 1, +#else + HasRsqrt = 0, +#endif HasRound = 1, HasFloor = 1, HasCeil = 1, @@ -863,21 +852,13 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ - return vec_vsx_ld(0, from); -#else - return vec_ld(0, from); -#endif + return vec_xl(0, const_cast(from)); // cast needed by Clang } template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ - vec_vsx_st(from, 0, to); -#else - vec_st(from, 0, to); -#endif + vec_xst(from, 0, to); } template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { @@ -889,24 +870,23 @@ template<> EIGEN_STRONG_INLINE void pbroadcast4(const double *a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) { - a1 = pload(a); - a0 = vec_splat_dbl<0>(a1); - a1 = vec_splat_dbl<1>(a1); - a3 = pload(a+2); - a2 = vec_splat_dbl<0>(a3); - a3 = vec_splat_dbl<1>(a3); + //This way is faster than vec_splat (at least for doubles in Power 9) + a0 = pset1(a[0]); + a1 = pset1(a[1]); + a2 = pset1(a[2]); + a3 = pset1(a[3]); } template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; pstore(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -918,7 +898,14 @@ template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return vec_xor(a, p2d_MZERO); +#endif +} template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } @@ -950,14 +937,24 @@ template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) +{ + Packet2d t = vec_add(reinterpret_cast(vec_or(vec_and(reinterpret_cast(a), p2ul_SIGN), p2ul_PREV0DOT5)), a); + Packet2d res; + + __asm__("xvrdpiz %x0, %x1\n\t" + : "=&wa" (res) + : "wa" (t)); + + return res; +} template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { - EIGEN_DEBUG_ALIGNED_LOAD - return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from)); + EIGEN_DEBUG_UNALIGNED_LOAD + return vec_xl(0, const_cast(from)); } template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) @@ -970,13 +967,13 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { - EIGEN_DEBUG_ALIGNED_STORE - vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); + EIGEN_DEBUG_UNALIGNED_STORE + vec_xst(from, 0, to); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {