diff --git a/Eigen/CoreDeclarations b/Eigen/CoreDeclarations index 6e5fb2d9e..223a8d416 100644 --- a/Eigen/CoreDeclarations +++ b/Eigen/CoreDeclarations @@ -23,15 +23,12 @@ #ifdef __SSSE3__ #include #endif - -/*** Disable AltiVec code for now as it's out of date #elif (defined __ALTIVEC__) #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_ALTIVEC #include // We _need_ to #undef bool as it's defined in for some reason. #undef bool -***/ #endif #endif diff --git a/Eigen/src/Array/Functors.h b/Eigen/src/Array/Functors.h index 31bcca899..1e3804311 100644 --- a/Eigen/src/Array/Functors.h +++ b/Eigen/src/Array/Functors.h @@ -36,6 +36,8 @@ template struct ei_scalar_add_op { typedef typename ei_packet_traits::type PacketScalar; + // FIXME default copy constructors seems bugged with std::complex<> + inline ei_scalar_add_op(const ei_scalar_add_op& other) : m_other(other.m_other) { } inline ei_scalar_add_op(const Scalar& other) : m_other(other) { } inline Scalar operator() (const Scalar& a) const { return a + m_other; } inline const PacketScalar packetOp(const PacketScalar& a) const @@ -131,6 +133,8 @@ struct ei_functor_traits > */ template struct ei_scalar_pow_op { + // FIXME default copy constructors seems bugged with std::complex<> + inline ei_scalar_pow_op(const ei_scalar_pow_op& other) : m_exponent(other.m_exponent) { } inline ei_scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {} inline Scalar operator() (const Scalar& a) const { return ei_pow(a, m_exponent); } const Scalar m_exponent; diff --git a/Eigen/src/Core/Functors.h b/Eigen/src/Core/Functors.h index b911200d9..f0bc1fd4f 100644 --- a/Eigen/src/Core/Functors.h +++ b/Eigen/src/Core/Functors.h @@ -143,8 +143,6 @@ struct ei_functor_traits > { PacketAccess = ei_packet_traits::size>1 #if (defined EIGEN_VECTORIZE_SSE) && NumTraits::HasFloatingPoint - #elif (defined EIGEN_VECTORIZE_ALTIVEC) - && 0 #endif }; }; @@ -260,6 +258,8 @@ struct ei_functor_traits > template struct ei_scalar_multiple_op { typedef typename ei_packet_traits::type PacketScalar; + // FIXME default copy constructors seems bugged with std::complex<> + inline ei_scalar_multiple_op(const ei_scalar_multiple_op& other) : m_other(other.m_other) { } inline ei_scalar_multiple_op(const Scalar& other) : m_other(other) { } inline Scalar operator() (const Scalar& a) const { return a * m_other; } inline const PacketScalar packetOp(const PacketScalar& a) const @@ -273,6 +273,8 @@ struct ei_functor_traits > template struct ei_scalar_quotient1_impl { typedef typename ei_packet_traits::type PacketScalar; + // FIXME default copy constructors seems bugged with std::complex<> + inline ei_scalar_quotient1_impl(const ei_scalar_quotient1_impl& other) : m_other(other.m_other) { } inline ei_scalar_quotient1_impl(const Scalar& other) : m_other(static_cast(1) / other) {} inline Scalar operator() (const Scalar& a) const { return a * m_other; } inline const PacketScalar packetOp(const PacketScalar& a) const @@ -285,10 +287,11 @@ struct ei_functor_traits > template struct ei_scalar_quotient1_impl { + // FIXME default copy constructors seems bugged with std::complex<> + inline ei_scalar_quotient1_impl(const ei_scalar_quotient1_impl& other) : m_other(other.m_other) { } inline ei_scalar_quotient1_impl(const Scalar& other) : m_other(other) {} inline Scalar operator() (const Scalar& a) const { return a / m_other; } const Scalar m_other; - enum { Cost = 2 * NumTraits::MulCost }; }; template struct ei_functor_traits > diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index cde368b03..9b2c29d79 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -2,7 +2,6 @@ // for linear algebra. Eigen itself is part of the KDE project. // // Copyright (C) 2008 Konstantinos Margaritis -// Copyright (C) 2008 Gael Guennebaud // // Eigen is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public @@ -35,11 +34,16 @@ typedef vector int v4i; typedef vector unsigned int v4ui; typedef vector __bool int v4bi; -static const v4i v0i = vec_splat_s32(0); -static const v4i v1i = vec_splat_s32(1); -static const v4i v16i_ = vec_splat_s32(-16); -static const v4f v0f = (v4f) v0i; -static const v4f v1f = (v4f) v1i; +// We don't want to write the same code all the time, but we need to reuse the constants +// and it doesn't really work to declare them global, so we define macros instead + +#define USE_CONST_v0i const v4i v0i = vec_splat_s32(0) +#define USE_CONST_v1i const v4i v1i = vec_splat_s32(1) +#define USE_CONST_v16i_ const v4i v16i_ = vec_splat_s32(-16) +#define USE_CONST_v0f USE_CONST_v0i; const v4f v0f = (v4f) v0i; +#define USE_CONST_v1f USE_CONST_v1i; const v4f v1f = vec_ctf(v1i, 0); +#define USE_CONST_v1i_ const v4ui v1i_ = vec_splat_u32(-1); +#define USE_CONST_v0f_ USE_CONST_v1i_; const v4f v0f_ = (v4f) vec_sl(v1i_, v1i_); template<> struct ei_packet_traits { typedef v4f type; enum {size=4}; }; template<> struct ei_packet_traits { typedef v4i type; enum {size=4}; }; @@ -54,7 +58,7 @@ std::ostream & operator <<(std::ostream & s, const v4f & v) float n[4]; } vt; vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl; + s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; return s; } @@ -65,7 +69,7 @@ std::ostream & operator <<(std::ostream & s, const v4i & v) int n[4]; } vt; vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl; + s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; return s; } @@ -76,7 +80,7 @@ std::ostream & operator <<(std::ostream & s, const v4ui & v) unsigned int n[4]; } vt; vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl; + s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; return s; } @@ -87,7 +91,7 @@ std::ostream & operator <<(std::ostream & s, const v4bi & v) unsigned int n[4]; } vt; vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3] << std::endl; + s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; return s; } @@ -97,71 +101,57 @@ template<> inline v4i ei_padd(const v4i& a, const v4i& b) { return vec_add( template<> inline v4f ei_psub(const v4f& a, const v4f& b) { return vec_sub(a,b); } template<> inline v4i ei_psub(const v4i& a, const v4i& b) { return vec_sub(a,b); } -template<> inline v4f ei_pmul(const v4f& a, const v4f& b) { return vec_madd(a,b, v0f); } +template<> inline v4f ei_pmul(const v4f& a, const v4f& b) { USE_CONST_v0f; return vec_madd(a,b, v0f); } template<> inline v4i ei_pmul(const v4i& a, const v4i& b) { - // Taken from http://developer.apple.com/hardwaredrivers/ve/algorithms.html#Multiply32 - //Set up constants - v4i bswap, low_prod, high_prod, prod, prod_, a_, b_, a1, b1; - v4i v1_, v0_, v1sel; - v1_ = (v4i) vec_splat_u32(-1); - v0_ = (v4i) vec_sl((v4ui) v1_, (v4ui) v1_); - -// std::cout << "a: " << a << std::endl; -// std::cout << "b: " << b << std::endl; - a_ = vec_sub(v0i, a); - b_ = vec_sub(v0i, b); - a1 = vec_max(a, a_); - b1 = vec_max(b, b_); - v4bi sgn = (v4bi) vec_cmplt(vec_and(vec_xor(a, b), v0_), v0i); + // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec + //Set up constants, variables + v4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel; + USE_CONST_v0i; + USE_CONST_v1i; + USE_CONST_v16i_; -// std::cout << "a: " << a1 << std::endl; -// std::cout << "b: " << b1 << std::endl; -// std::cout << "a_: " << a_ << std::endl; -// std::cout << "b_: " << b_ << std::endl; -// std::cout << "sgn(a*b): " << std::hex << sgn << std::dec << std::endl; + // Get the absolute values + a1 = vec_abs(a); + b1 = vec_abs(b); - // Do real work + // Get the signs using xor + v4bi sgn = (v4bi) vec_cmplt(vec_xor(a, b), v0i); + + // Do the multiplication for the asbolute values. bswap = (v4i) vec_rl((v4ui) b1, (v4ui) v16i_ ); low_prod = vec_mulo((vector short)a1, (vector short)b1); high_prod = vec_msum((vector short)a1, (vector short)bswap, v0i); high_prod = (v4i) vec_sl((v4ui) high_prod, (v4ui) v16i_); prod = vec_add( low_prod, high_prod ); -// std::cout << "prod: " << prod << std::endl; -// std::cout << "v0i: " << v0i << std::endl; + + // NOR the product and select only the negative elements according to the sign mask prod_ = vec_nor(prod, prod); prod_ = vec_sel(v0i, prod_, sgn); + + // Add 1 to the result to get the negative numbers v1sel = vec_sel(v0i, v1i, sgn); -// std::cout << "prod_: " << prod_ << std::endl; -// std::cout << "v1sel: " << v1sel << std::endl; prod_ = vec_add(prod_, v1sel); -// std::cout << "prod_: " << prod_ << std::endl; + + // Merge the results back to the final vector. prod = vec_sel(prod, prod_, sgn); -// std::cout << "final prod: " << prod << std::endl; + return prod; } template<> inline v4f ei_pdiv(const v4f& a, const v4f& b) { - std::cout << "a: " << a << std::endl; - std::cout << "b: " << b << std::endl; + v4f t, y_0, y_1, res; + USE_CONST_v0f; + USE_CONST_v1f; // Altivec does not offer a divide instruction, we have to do a reciprocal approximation - v4f y = vec_re(b); - std::cout << "1/b: " << y << std::endl; + y_0 = vec_re(b); + + // Do one Newton-Raphson iteration to get the needed accuracy + t = vec_nmsub(y_0, b, v1f); + y_1 = vec_madd(y_0, t, y_0); - // Set up some constants for inverse reciprocals - vector unsigned int v1_; - v4f v0_, t; - v1_ = vec_splat_u32(-1); - std::cout << "-1: " << (v4i) v1_ << std::endl; - v0_ = (v4f) vec_sl(v1_, v1_); - std::cout << "-0.0: " << v0_ << std::endl; - // Do a Newton-Raphson iteration to get the needed accuracy - t = vec_nmsub(y, b, v1f); - std::cout << "t: " << t << std::endl; - y = vec_nmsub(t, y, v0_); - v4f res = vec_madd(a, y, v0_); - std::cout << "res: " << res << std::endl; + res = vec_madd(a, y_1, v0f); return res; } @@ -184,7 +174,7 @@ template<> inline v4f ei_ploadu(const float* from) MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword mask = vec_lvsl(0, from); // create the permute mask - return (v4f) vec_perm(MSQ, LSQ, mask); // align the data + return (v4f) vec_perm(MSQ, LSQ, mask); // align the data } template<> inline v4i ei_ploadu(const int* from) @@ -336,6 +326,7 @@ inline v4i ei_preduxp(const v4i* vecs) inline int ei_predux(const v4i& a) { + USE_CONST_v0i; v4i sum; sum = vec_sums(a, v0i); sum = vec_sld(sum, v0i, 12);