diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h index 767e2d554..9feb38f81 100644 --- a/Eigen/src/Core/arch/AVX/TypeCasting.h +++ b/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -56,6 +56,40 @@ struct type_casting_traits : vectorized_type_casting_traits(tmp), lo); + _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp + 4), hi); + return _mm256_loadu_si256(reinterpret_cast(tmp)); +#endif +#else + return _mm256_set_m128i(hi, lo); +#endif +} + template <> EIGEN_STRONG_INLINE Packet16b pcast(const Packet8f& a, const Packet8f& b) { __m256 nonzero_a = _mm256_cmp_ps(a, pzero(a), _CMP_NEQ_UQ); @@ -109,7 +143,7 @@ EIGEN_STRONG_INLINE Packet8i pcast(const Packet8f& a) { template <> EIGEN_STRONG_INLINE Packet8i pcast(const Packet4d& a, const Packet4d& b) { - return _mm256_set_m128i(_mm256_cvttpd_epi32(b), _mm256_cvttpd_epi32(a)); + return _eigen_mm256_set_m128i(_mm256_cvttpd_epi32(b), _mm256_cvttpd_epi32(a)); } template <> @@ -124,7 +158,7 @@ EIGEN_STRONG_INLINE Packet8f pcast(const Packet8i& a) { template <> EIGEN_STRONG_INLINE Packet8f pcast(const Packet4d& a, const Packet4d& b) { - return _mm256_set_m128(_mm256_cvtpd_ps(b), _mm256_cvtpd_ps(a)); + return _eigen_mm256_set_m128(_mm256_cvtpd_ps(b), _mm256_cvtpd_ps(a)); } template <> @@ -249,7 +283,7 @@ EIGEN_STRONG_INLINE Packet4d pcast(const Packet4l& a) { template <> EIGEN_STRONG_INLINE Packet4d pcast(const Packet2l& a, const Packet2l& b) { - return _mm256_set_m128d((pcast(b)), (pcast(a))); + return _eigen_mm256_set_m128d((pcast(b)), (pcast(a))); } template <> diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 75a83e1ea..bfcab5c8d 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -44,6 +44,54 @@ typedef eigen_packet_wrapper<__m512i, 6> Packet32s; typedef eigen_packet_wrapper<__m256i, 6> Packet16s; typedef eigen_packet_wrapper<__m128i, 6> Packet8s; +EIGEN_STRONG_INLINE Packet16i _eigen_mm512_loadu_epi32(const int* from) { +#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 1010) + return _mm512_loadu_si512(reinterpret_cast(from)); +#else + return _mm512_loadu_epi32(from); +#endif +} + +EIGEN_STRONG_INLINE Packet16i _eigen_mm512_loadu_epi64(const int64_t* from) { +#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 1010) + return _mm512_loadu_si512(reinterpret_cast(from)); +#else + return _mm512_loadu_epi64(from); +#endif +} + +EIGEN_STRONG_INLINE void _eigen_mm512_storeu_epi32(void* to, const Packet16i& from) { +#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 1010) + _mm512_storeu_si512(to, from); +#else + _mm512_storeu_epi32(to, from); +#endif +} + +EIGEN_STRONG_INLINE void _eigen_mm512_storeu_epi64(void* to, const Packet16i& from) { +#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 1010) + _mm512_storeu_si512(to, from); +#else + _mm512_storeu_epi64(to, from); +#endif +} + +EIGEN_STRONG_INLINE void _eigen_mm256_storeu_epi32(void* to, const __m256i& from) { +#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 1010) + _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); +#else + _mm256_storeu_epi32(to, from); +#endif +} + +EIGEN_STRONG_INLINE void _eigen_mm_storeu_epi32(void* to, const __m128i& from) { +#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 1010) + _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); +#else + _mm_storeu_epi32(to, from); +#endif +} + template <> struct is_arithmetic<__m512> { enum { value = true }; @@ -1033,11 +1081,11 @@ EIGEN_STRONG_INLINE Packet8d ploadu(const double* from) { } template <> EIGEN_STRONG_INLINE Packet16i ploadu(const int* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_epi32(from); + EIGEN_DEBUG_UNALIGNED_LOAD return _eigen_mm512_loadu_epi32(from); } template <> EIGEN_STRONG_INLINE Packet8l ploadu(const int64_t* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_epi64(from); + EIGEN_DEBUG_UNALIGNED_LOAD return _eigen_mm512_loadu_epi64(from); } template <> @@ -1158,11 +1206,11 @@ EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet8d& from) { } template <> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet16i& from) { - EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_epi32(to, from); + EIGEN_DEBUG_UNALIGNED_STORE _eigen_mm512_storeu_epi32(to, from); } template <> EIGEN_STRONG_INLINE void pstoreu(int64_t* to, const Packet8l& from) { - EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_epi64(to, from); + EIGEN_DEBUG_UNALIGNED_STORE _eigen_mm512_storeu_epi64(to, from); } template <> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet16f& from, uint16_t umask) { @@ -2997,19 +3045,19 @@ EIGEN_STRONG_INLINE void pstore(numext::int16_t* out, template <> EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* out, const Packet32s& x) { EIGEN_DEBUG_UNALIGNED_STORE - _mm512_storeu_epi32(out, x); + _eigen_mm512_storeu_epi32(out, x); } template <> EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* out, const Packet16s& x) { EIGEN_DEBUG_UNALIGNED_STORE - _mm256_storeu_epi32(out, x); + _eigen_mm256_storeu_epi32(out, x); } template <> EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* out, const Packet8s& x) { EIGEN_DEBUG_UNALIGNED_STORE - _mm_storeu_epi32(out, x); + _eigen_mm_storeu_epi32(out, x); } template <> diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h index fc55fd861..e970d43d2 100644 --- a/Eigen/src/Core/arch/AVX512/TypeCasting.h +++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h @@ -52,9 +52,17 @@ struct type_casting_traits : vectorized_type_casting_traits struct type_casting_traits : vectorized_type_casting_traits {}; +EIGEN_STRONG_INLINE __mmask16 _eigen_mm512_cmpneq_ps_mask(__m512 a, __m512 b) { +#if EIGEN_COMP_GNUC && (EIGEN_COMP_CLANG < 1000 || EIGEN_COMP_GNUC < 810) + return _mm512_cmp_ps_mask(a, b, _CMP_NEQ_UQ); +#else + return _mm512_cmpneq_ps_mask(a, b); +#endif +} + template <> EIGEN_STRONG_INLINE Packet16b pcast(const Packet16f& a) { - __mmask16 mask = _mm512_cmpneq_ps_mask(a, pzero(a)); + __mmask16 mask = _eigen_mm512_cmpneq_ps_mask(a, pzero(a)); return _mm512_maskz_cvtepi32_epi8(mask, _mm512_set1_epi32(1)); }