diff --git a/Eigen/src/Core/CacheFriendlyProduct.h b/Eigen/src/Core/CacheFriendlyProduct.h index bad018e8e..8c012b58f 100644 --- a/Eigen/src/Core/CacheFriendlyProduct.h +++ b/Eigen/src/Core/CacheFriendlyProduct.h @@ -359,19 +359,6 @@ static void ei_cache_friendly_product( #endif // EIGEN_EXTERN_INSTANTIATIONS -template -inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset) -{ - typedef typename ei_packet_traits::type Packet; - const int PacketSize = ei_packet_traits::size; - const int PacketAlignedMask = PacketSize-1; - const bool Vectorized = PacketSize>1; - return Vectorized - ? std::min( (PacketSize - ((size_t(ptr)/sizeof(Scalar)) & PacketAlignedMask)) - & PacketAlignedMask, maxOffset) - : 0; -} - /* Optimized col-major matrix * vector product: * This algorithm processes 4 columns at onces that allows to both reduce * the number of load/stores of the result by a factor 4 and to reduce @@ -420,7 +407,7 @@ EIGEN_DONT_INLINE static void ei_cache_friendly_product_colmajor_times_vector( // we cannot assume the first element is aligned because of sub-matrices const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size); - ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0); + ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size inline void ei_pstoret ei_pstoreu(to, from); } +/** \internal \returns the number of elements which have to be skipped such that data are aligned */ +template +inline static int ei_alignmentOffset(const Scalar* ptr, int maxOffset) +{ + typedef typename ei_packet_traits::type Packet; + const int PacketSize = ei_packet_traits::size; + const int PacketAlignedMask = PacketSize-1; + const bool Vectorized = PacketSize>1; + return Vectorized + ? std::min( (PacketSize - ((size_t(ptr)/sizeof(Scalar)) & PacketAlignedMask)) + & PacketAlignedMask, maxOffset) + : 0; +} + #endif // EIGEN_DUMMY_PACKET_MATH_H diff --git a/Eigen/src/Core/InverseProduct.h b/Eigen/src/Core/InverseProduct.h index cfab54228..cd27c4ddf 100755 --- a/Eigen/src/Core/InverseProduct.h +++ b/Eigen/src/Core/InverseProduct.h @@ -98,6 +98,8 @@ struct ei_trisolve_selector }; // forward substitution, col-major +// FIXME the Lower and Upper specialization could be merged using a small helper class +// performing reflexions on the coordinates... template struct ei_trisolve_selector { @@ -138,6 +140,8 @@ struct ei_trisolve_selector * other.col(c).end(size-endBlock) += (lhs.block(endBlock, startBlock, size-endBlock, endBlock-startBlock) * * other.col(c).block(startBlock,endBlock-startBlock)).lazy(); */ + // FIXME this is cool but what about conjugate/adjoint expressions ? do we want to evaluate them ? + // this is a more general problem though. ei_cache_friendly_product_colmajor_times_vector( size-endBlock, &(lhs.const_cast_derived().coeffRef(endBlock,startBlock)), lhs.stride(), btmp, &(other.coeffRef(endBlock,c))); diff --git a/Eigen/src/Core/Product.h b/Eigen/src/Core/Product.h index 9a0365939..60a1d0a35 100644 --- a/Eigen/src/Core/Product.h +++ b/Eigen/src/Core/Product.h @@ -379,6 +379,7 @@ struct ei_product_coeff_vectorized_dyn_selector }; // NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower +// NOTE maybe they are now useless since we have a specialization for Block template struct ei_product_coeff_vectorized_dyn_selector { @@ -406,7 +407,7 @@ struct ei_product_coeff_vectorized_dyn_selector template struct ei_product_coeff_vectorized_dyn_selector { - inline static void run(int row, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + inline static void run(int /*row*/, int /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) { res = ei_dot_impl< Lhs,