From 9d2bf35a05b21d0203201a0b72b54022cae24670 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Sat, 12 Feb 2011 16:40:09 +0100 Subject: [PATCH] implement optimized ploadu for MSVC10: this also fix bad code generation in gebp_kernel :) --- Eigen/src/Core/arch/SSE/PacketMath.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index fa499a870..0872a04f4 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -222,7 +222,20 @@ template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { E template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } #if defined(_MSC_VER) - template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ps(from); } + template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD + #if (_MSC_VER==1600) + // NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps + // (i.e., it does not generate an unaligned load!! + // TODO On most architectures this version should also be faster than a single _mm_loadu_ps + // so we could also enable it for MSVC08 but first we have to make this later does not generate crap when doing so... + __m128 res = _mm_loadl_pi(res, (const __m64*)(from)); + res = _mm_loadh_pi(res, (const __m64*)(from+2)); + return res; + #else + return _mm_loadu_ps(from); + #endif + } template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } #else