Patch PPC PacketMath from 3.4.

This is to fix failing tests for PPC due to UB on loads.
This commit is contained in:
Antonio Sanchez
2025-03-07 20:59:30 -08:00
parent 49bd503308
commit ce950ca2db
2 changed files with 312 additions and 265 deletions

View File

@@ -15,7 +15,9 @@ namespace Eigen {
namespace internal {
static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
inline Packet4ui p4ui_CONJ_XOR() {
return vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
}
#ifdef __VSX__
#if defined(_BIG_ENDIAN)
static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
@@ -29,8 +31,54 @@ static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (P
//---------- float ----------
struct Packet2cf
{
EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {}
EIGEN_STRONG_INLINE explicit Packet2cf() {}
EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b)
{
Packet4f v1, v2;
// Permute and multiply the real parts of a and b
v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
// Get the imaginary parts of a
v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
// multiply a_re * b
v1 = vec_madd(v1, b.v, p4f_ZERO);
// multiply a_im * b and get the conjugate result
v2 = vec_madd(v2, b.v, p4f_ZERO);
v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR())));
// permute back to a proper order
v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
return Packet2cf(padd<Packet4f>(v1, v2));
}
EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
v = pmul(Packet2cf(*this), b).v;
return *this;
}
EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
return Packet2cf(*this) *= b;
}
EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
v = padd(v, b.v);
return *this;
}
EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
return Packet2cf(*this) += b;
}
EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
v = psub(v, b.v);
return *this;
}
EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
return Packet2cf(*this) -= b;
}
EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
return Packet2cf(-v);
}
Packet4f v;
};
@@ -82,14 +130,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
{
std::complex<float> EIGEN_ALIGN16 af[2];
EIGEN_ALIGN16 std::complex<float> af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet2cf>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
{
std::complex<float> EIGEN_ALIGN16 af[2];
EIGEN_ALIGN16 std::complex<float> af[2];
pstore<std::complex<float> >((std::complex<float> *) af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
@@ -98,26 +146,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
Packet4f v1, v2;
// Permute and multiply the real parts of a and b
v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
// Get the imaginary parts of a
v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
// multiply a_re * b
v1 = vec_madd(v1, b.v, p4f_ZERO);
// multiply a_im * b and get the conjugate result
v2 = vec_madd(v2, b.v, p4f_ZERO);
v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
// permute back to a proper order
v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
return Packet2cf(padd<Packet4f>(v1, v2));
}
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR()))); }
template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
@@ -128,7 +157,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
std::complex<float> EIGEN_ALIGN16 res[2];
EIGEN_ALIGN16 std::complex<float> res[2];
pstore((float *)&res, a.v);
return res[0];
@@ -152,7 +181,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packe
template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
{
Packet4f b1, b2;
#ifdef _BIG_ENDIAN
#ifdef _BIG_ENDIAN
b1 = vec_sld(vecs[0].v, vecs[1].v, 8);
b2 = vec_sld(vecs[1].v, vecs[0].v, 8);
#else
@@ -260,6 +289,51 @@ struct Packet1cd
{
EIGEN_STRONG_INLINE Packet1cd() {}
EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b)
{
Packet2d a_re, a_im, v1, v2;
// Permute and multiply the real parts of a and b
a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
// Get the imaginary parts of a
a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
// multiply a_re * b
v1 = vec_madd(a_re, b.v, p2d_ZERO);
// multiply a_im * b and get the conjugate result
v2 = vec_madd(a_im, b.v, p2d_ZERO);
v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
return Packet1cd(padd<Packet2d>(v1, v2));
}
EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
v = pmul(Packet1cd(*this), b).v;
return *this;
}
EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
return Packet1cd(*this) *= b;
}
EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
v = padd(v, b.v);
return *this;
}
EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
return Packet1cd(*this) += b;
}
EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
v = psub(v, b.v);
return *this;
}
EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
return Packet1cd(*this) -= b;
}
EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
return Packet1cd(-v);
}
Packet2d v;
};
@@ -296,19 +370,13 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<
template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
{
std::complex<double> EIGEN_ALIGN16 af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet1cd>(af);
return pload<Packet1cd>(from);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
{
std::complex<double> EIGEN_ALIGN16 af[2];
pstore<std::complex<double> >(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
pstore<std::complex<double> >(to, from);
}
template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
@@ -316,24 +384,6 @@ template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, con
template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }
template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
Packet2d a_re, a_im, v1, v2;
// Permute and multiply the real parts of a and b
a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
// Get the imaginary parts of a
a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
// multiply a_re * b
v1 = vec_madd(a_re, b.v, p2d_ZERO);
// multiply a_im * b and get the conjugate result
v2 = vec_madd(a_im, b.v, p2d_ZERO);
v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1));
return Packet1cd(padd<Packet2d>(v1, v2));
}
template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
@@ -345,7 +395,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
std::complex<double> EIGEN_ALIGN16 res[2];
EIGEN_ALIGN16 std::complex<double> res[2];
pstore<std::complex<double> >(res, a);
return res[0];

View File

@@ -22,10 +22,6 @@ namespace internal {
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif
// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
@@ -40,9 +36,8 @@ typedef __vector unsigned char Packet16uc;
// We don't want to write the same code all the time, but we need to reuse the constants
// and it doesn't really work to declare them global, so we define macros instead
#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
Packet4f p4f_##NAME = {X, X, X, X}
#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
Packet4i p4i_##NAME = vec_splat_s32(X)
@@ -64,7 +59,7 @@ typedef __vector unsigned char Packet16uc;
#define DST_CHAN 1
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
// These constants are endian-agnostic
static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
@@ -77,21 +72,15 @@ static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
#endif
static Packet4ui p4ui_SIGN = {0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u};
static Packet4ui p4ui_PREV0DOT5 = {0x3EFFFFFFu, 0x3EFFFFFFu, 0x3EFFFFFFu, 0x3EFFFFFFu};
static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
// Mask alignment
#ifdef __PPC64__
#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
#else
#define _EIGEN_MASK_ALIGNMENT 0xfffffff0
#endif
#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
// Handle endianness properly while loading constants
// Define global static constants:
#ifdef _BIG_ENDIAN
@@ -235,112 +224,131 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
return s;
}
// Need to define them first or we get specialization after instantiation errors
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
template <typename Packet>
EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from)
{
// some versions of GCC throw "unused-but-set-parameter".
// ignoring these warnings for now.
EIGEN_UNUSED_VARIABLE(from);
EIGEN_DEBUG_ALIGNED_LOAD
#ifdef __VSX__
return vec_vsx_ld(0, from);
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#else
return vec_ld(0, from);
#endif
}
// Need to define them first or we get specialization after instantiation errors
template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
{
return pload_common<Packet4f>(from);
}
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
{
EIGEN_DEBUG_ALIGNED_LOAD
return pload_common<Packet4i>(from);
}
template <typename Packet>
EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
// some versions of GCC throw "unused-but-set-parameter" (float *to).
// ignoring these warnings for now.
EIGEN_UNUSED_VARIABLE(to);
EIGEN_DEBUG_ALIGNED_STORE
#ifdef __VSX__
return vec_vsx_ld(0, from);
vec_xst(from, 0, to);
#else
return vec_ld(0, from);
vec_st(from, 0, to);
#endif
}
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
{
EIGEN_DEBUG_ALIGNED_STORE
#ifdef __VSX__
vec_vsx_st(from, 0, to);
#else
vec_st(from, 0, to);
#endif
pstore_common<Packet4f>(to, from);
}
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
{
EIGEN_DEBUG_ALIGNED_STORE
#ifdef __VSX__
vec_vsx_st(from, 0, to);
#else
vec_st(from, 0, to);
#endif
pstore_common<Packet4i>(to, from);
}
template<typename Packet>
EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
{
Packet v = {from, from, from, from};
return v;
}
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
Packet4f v = {from, from, from, from};
return v;
return pset1_size4<Packet4f>(from);
}
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
Packet4i v = {from, from, from, from};
return v;
return pset1_size4<Packet4i>(from);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
template<typename Packet> EIGEN_STRONG_INLINE void
pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
Packet& a0, Packet& a1, Packet& a2, Packet& a3)
{
a3 = pload<Packet4f>(a);
a3 = pload<Packet>(a);
a0 = vec_splat(a3, 0);
a1 = vec_splat(a3, 1);
a2 = vec_splat(a3, 2);
a3 = vec_splat(a3, 3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4i>(const int *a,
Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
{
a3 = pload<Packet4i>(a);
a0 = vec_splat(a3, 0);
a1 = vec_splat(a3, 1);
a2 = vec_splat(a3, 2);
a3 = vec_splat(a3, 3);
pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
}
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride)
{
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
a[0] = from[0*stride];
a[1] = from[1*stride];
a[2] = from[2*stride];
a[3] = from[3*stride];
return pload<Packet>(a);
}
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
float EIGEN_ALIGN16 af[4];
af[0] = from[0*stride];
af[1] = from[1*stride];
af[2] = from[2*stride];
af[3] = from[3*stride];
return pload<Packet4f>(af);
return pgather_common<Packet4f>(from, stride);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{
int EIGEN_ALIGN16 ai[4];
ai[0] = from[0*stride];
ai[1] = from[1*stride];
ai[2] = from[2*stride];
ai[3] = from[3*stride];
return pload<Packet4i>(ai);
return pgather_common<Packet4i>(from, stride);
}
template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride)
{
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4];
pstore<__UNPACK_TYPE__(Packet)>(a, from);
to[0*stride] = a[0];
to[1*stride] = a[1];
to[2*stride] = a[2];
to[3*stride] = a[3];
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
float EIGEN_ALIGN16 af[4];
pstore<float>(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
to[2*stride] = af[2];
to[3*stride] = af[3];
pscatter_size4<Packet4f>(to, from, stride);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
{
int EIGEN_ALIGN16 ai[4];
pstore<int>((int *)ai, from);
to[0*stride] = ai[0];
to[1*stride] = ai[1];
to[2*stride] = ai[2];
to[3*stride] = ai[3];
pscatter_size4<Packet4i>(to, from, stride);
}
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
@@ -424,66 +432,71 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
{
Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
Packet4f res;
#ifdef __VSX__
__asm__("xvrspiz %x0, %x1\n\t"
: "=&wa" (res)
: "wa" (t));
#else
__asm__("vrfiz %0, %1\n\t"
: "=v" (res)
: "v" (t));
#endif
return res;
}
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
#ifdef _BIG_ENDIAN
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
{
EIGEN_DEBUG_ALIGNED_LOAD
Packet16uc MSQ, LSQ;
Packet16uc mask;
MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
mask = vec_lvsl(0, from); // create the permute mask
return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data
}
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
EIGEN_DEBUG_ALIGNED_LOAD
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
Packet16uc MSQ, LSQ;
Packet16uc mask;
MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
mask = vec_lvsl(0, from); // create the permute mask
return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data
}
EIGEN_DEBUG_UNALIGNED_LOAD
#ifdef __VSX__
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#else
// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
Packet16uc mask = vec_lvsl(0, from); // create the permute mask
Packet16uc MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
Packet16uc LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
//TODO: Add static_cast here
return (Packet) vec_perm(MSQ, LSQ, mask); // align the data
#endif
}
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
return ploadu_common<Packet4f>(from);
}
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
return ploadu_common<Packet4i>(from);
}
#endif
template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
{
Packet p;
if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet>(from);
else p = ploadu<Packet>(from);
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
Packet4f p;
if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from);
else p = ploadu<Packet4f>(from);
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
return ploaddup_common<Packet4f>(from);
}
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
Packet4i p;
if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from);
else p = ploadu<Packet4i>(from);
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
return ploaddup_common<Packet4i>(from);
}
#ifdef _BIG_ENDIAN
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from)
{
EIGEN_DEBUG_UNALIGNED_STORE
#ifdef __VSX__
vec_xst(from, 0, to);
#else
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
// Warning: not thread safe!
Packet16uc MSQ, LSQ, edges;
@@ -497,45 +510,23 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& f
MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ)
LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second
#endif
}
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
{
pstoreu_common<Packet4f>(to, from);
}
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
{
EIGEN_DEBUG_UNALIGNED_STORE
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
// Warning: not thread safe!
Packet16uc MSQ, LSQ, edges;
Packet16uc edgeAlign, align;
MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
edgeAlign = vec_lvsl(0, to); // permute map to extract edges
edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges
align = vec_lvsr( 0, to ); // permute map to misalign data
MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ)
LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ)
vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
pstoreu_common<Packet4i>(to, from);
}
#else
// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
{
EIGEN_DEBUG_ALIGNED_STORE
vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));
}
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
{
EIGEN_DEBUG_ALIGNED_STORE
vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
}
#endif
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; }
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{
@@ -643,37 +634,42 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
}
// min
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
template<typename Packet> EIGEN_STRONG_INLINE
__UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
{
Packet4f b, res;
Packet b, res;
b = vec_min(a, vec_sld(a, a, 8));
res = vec_min(b, vec_sld(b, b, 4));
return pfirst(res);
}
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
return predux_min4<Packet4f>(a);
}
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
Packet4i b, res;
b = vec_min(a, vec_sld(a, a, 8));
res = vec_min(b, vec_sld(b, b, 4));
return pfirst(res);
return predux_min4<Packet4i>(a);
}
// max
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a)
{
Packet4f b, res;
Packet b, res;
b = vec_max(a, vec_sld(a, a, 8));
res = vec_max(b, vec_sld(b, b, 4));
return pfirst(res);
}
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
return predux_max4<Packet4f>(a);
}
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
Packet4i b, res;
b = vec_max(a, vec_sld(a, a, 8));
res = vec_max(b, vec_sld(b, b, 4));
return pfirst(res);
return predux_max4<Packet4i>(a);
}
template<int Offset>
@@ -730,9 +726,9 @@ struct palign_impl<Offset,Packet4i>
}
};
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
Packet4f t0, t1, t2, t3;
template <typename T>
EIGEN_DEVICE_FUNC inline void ptranpose_common(PacketBlock<T, 4>& kernel) {
T t0, t1, t2, t3;
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -743,29 +739,23 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
kernel.packet[3] = vec_mergel(t1, t3);
}
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
Packet4i t0, t1, t2, t3;
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
kernel.packet[0] = vec_mergeh(t0, t2);
kernel.packet[1] = vec_mergel(t0, t2);
kernel.packet[2] = vec_mergeh(t1, t3);
kernel.packet[3] = vec_mergel(t1, t3);
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranpose_common<Packet4f>(kernel); }
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranpose_common<Packet4i>(kernel); }
template<typename Packet> EIGEN_STRONG_INLINE
Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
return vec_sel(elsePacket, thenPacket, mask);
}
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
return vec_sel(elsePacket, thenPacket, mask);
return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
}
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
return vec_sel(elsePacket, thenPacket, mask);
return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
}
@@ -785,6 +775,8 @@ static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
static Packet2d p2d_ONE = { 1.0, 1.0 };
static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
static Packet2d p2d_MZERO = { -0.0, -0.0 };
static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};
static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};
#ifdef _BIG_ENDIAN
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
@@ -792,16 +784,9 @@ static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_c
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
#endif
template<int index> Packet2d vec_splat_dbl(Packet2d& a);
template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a)
template<int index> Packet2d vec_splat_dbl(Packet2d& a)
{
return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI));
}
template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a)
{
return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO));
return vec_splat(a, index);
}
template<> struct packet_traits<double> : default_packet_traits
@@ -826,7 +811,11 @@ template<> struct packet_traits<double> : default_packet_traits
HasLog = 0,
HasExp = 1,
HasSqrt = 1,
#if !EIGEN_COMP_CLANG
HasRsqrt = 1,
#else
HasRsqrt = 0,
#endif
HasRound = 1,
HasFloor = 1,
HasCeil = 1,
@@ -863,21 +852,13 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
{
EIGEN_DEBUG_ALIGNED_LOAD
#ifdef __VSX__
return vec_vsx_ld(0, from);
#else
return vec_ld(0, from);
#endif
return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
}
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
{
EIGEN_DEBUG_ALIGNED_STORE
#ifdef __VSX__
vec_vsx_st(from, 0, to);
#else
vec_st(from, 0, to);
#endif
vec_xst(from, 0, to);
}
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
@@ -889,24 +870,23 @@ template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
a1 = pload<Packet2d>(a);
a0 = vec_splat_dbl<0>(a1);
a1 = vec_splat_dbl<1>(a1);
a3 = pload<Packet2d>(a+2);
a2 = vec_splat_dbl<0>(a3);
a3 = vec_splat_dbl<1>(a3);
//This way is faster than vec_splat (at least for doubles in Power 9)
a0 = pset1<Packet2d>(a[0]);
a1 = pset1<Packet2d>(a[1]);
a2 = pset1<Packet2d>(a[2]);
a3 = pset1<Packet2d>(a[3]);
}
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
double EIGEN_ALIGN16 af[2];
EIGEN_ALIGN16 double af[2];
af[0] = from[0*stride];
af[1] = from[1*stride];
return pload<Packet2d>(af);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
double EIGEN_ALIGN16 af[2];
EIGEN_ALIGN16 double af[2];
pstore<double>(af, from);
to[0*stride] = af[0];
to[1*stride] = af[1];
@@ -918,7 +898,14 @@ template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
{
#ifdef __POWER8_VECTOR__
return vec_neg(a);
#else
return vec_xor(a, p2d_MZERO);
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
@@ -950,14 +937,24 @@ template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
{
Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
Packet2d res;
__asm__("xvrdpiz %x0, %x1\n\t"
: "=&wa" (res)
: "wa" (t));
return res;
}
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
{
EIGEN_DEBUG_ALIGNED_LOAD
return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from));
EIGEN_DEBUG_UNALIGNED_LOAD
return vec_xl(0, const_cast<double*>(from));
}
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
@@ -970,13 +967,13 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
{
EIGEN_DEBUG_ALIGNED_STORE
vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
EIGEN_DEBUG_UNALIGNED_STORE
vec_xst(from, 0, to);
}
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{