diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 8bc8fb535..4ba322a32 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -270,6 +270,14 @@ Packet psqrt(const Packet& a) { return sqrt(a); } * The following functions might not have to be overwritten for vectorized types ***************************************************************************/ +/** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */ +// NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type) +template +inline void pstore1(typename unpacket_traits::type* to, const typename unpacket_traits::type& a) +{ + pstore(to, pset1(a)); +} + /** \internal \returns a * b + c (coeff-wise) */ template inline Packet pmadd(const Packet& a, diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 43ad28d42..908e27368 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -305,6 +305,19 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, _mm_castps_pd(from)); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, _mm_castsi128_pd(from)); } +// some compilers might be tempted to perform multiple moves instead of using a vector path. +template<> EIGEN_STRONG_INLINE void pstore1(float* to, const float& a) +{ + Packet4f pa = _mm_set_ss(a); + pstore(to, vec4f_swizzle1(pa,0,0,0,0)); +} +// some compilers might be tempted to perform multiple moves instead of using a vector path. 
+template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& a) +{ + Packet2d pa = _mm_set_sd(a); + pstore(to, vec2d_swizzle1(pa,0,0)); +} + template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 25ca39d76..2116dcc74 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -199,7 +199,7 @@ public: EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) { for(DenseIndex k=0; k(rhs[k])); + pstore1(&b[k*RhsPacketSize], rhs[k]); } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const @@ -270,7 +270,7 @@ public: EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) { for(DenseIndex k=0; k(rhs[k])); + pstore1(&b[k*RhsPacketSize], rhs[k]); } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const @@ -363,8 +363,8 @@ public: { if(Vectorizable) { - pstore((RealScalar*)&b[k*ResPacketSize*2+0], pset1(real(rhs[k]))); - pstore((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], pset1(imag(rhs[k]))); + pstore1((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k])); + pstore1((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k])); } else b[k] = rhs[k]; @@ -475,7 +475,7 @@ public: EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) { for(DenseIndex k=0; k(rhs[k])); + pstore1(&b[k*RhsPacketSize], rhs[k]); } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const @@ -1009,12 +1009,7 @@ EIGEN_ASM_COMMENT("mybegin4"); for(Index j2=packet_cols; 
j2(blB[k])); - } + traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB); for(Index i=0; i