diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index bf50697f6..50a25cf56 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -2496,38 +2496,60 @@ template <> EIGEN_STRONG_INLINE Packet4f ploadquad(const float* from) { return vld1q_dup_f32(from); } + +// WORKAROUND: Apple Clang 17.0.0 (and Homebrew Clang 21.1.8) at -O0 optimization +// generate incorrect code for vld1_dup_[su]8, ignoring the pointer offset. +// We use vdup_n_s8(*from) to force a safe scalar load before broadcast. +EIGEN_ALWAYS_INLINE int8x8_t eigen_vld1_dup_s8(const int8_t* ptr) { +#if EIGEN_COMP_CLANGAPPLE && EIGEN_ARCH_ARM64 + return vdup_n_s8(*ptr); +#else + return vld1_dup_s8(ptr); +#endif +} + +EIGEN_ALWAYS_INLINE uint8x8_t eigen_vld1_dup_u8(const uint8_t* ptr) { +#if EIGEN_COMP_CLANGAPPLE && EIGEN_ARCH_ARM64 + return vdup_n_u8(*ptr); +#else + return vld1_dup_u8(ptr); +#endif +} + template <> EIGEN_STRONG_INLINE Packet4c ploadquad(const int8_t* from) { - return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); + return vget_lane_s32(vreinterpret_s32_s8(eigen_vld1_dup_s8(from)), 0); } template <> EIGEN_STRONG_INLINE Packet8c ploadquad(const int8_t* from) { return vreinterpret_s8_u32( - vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]); + vzip_u32(vreinterpret_u32_s8(eigen_vld1_dup_s8(from)), vreinterpret_u32_s8(eigen_vld1_dup_s8(from + 1))).val[0]); } template <> EIGEN_STRONG_INLINE Packet16c ploadquad(const int8_t* from) { const int8x8_t a = vreinterpret_s8_u32( - vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]); + vzip_u32(vreinterpret_u32_s8(eigen_vld1_dup_s8(from)), vreinterpret_u32_s8(eigen_vld1_dup_s8(from + 1))).val[0]); const int8x8_t b = vreinterpret_s8_u32( - vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from + 2)), vreinterpret_u32_s8(vld1_dup_s8(from + 3))).val[0]); + vzip_u32(vreinterpret_u32_s8(eigen_vld1_dup_s8(from + 2)), vreinterpret_u32_s8(eigen_vld1_dup_s8(from + 3))) + .val[0]); return vcombine_s8(a, b); } template <> EIGEN_STRONG_INLINE Packet4uc ploadquad(const uint8_t* from) { - return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); + return vget_lane_u32(vreinterpret_u32_u8(eigen_vld1_dup_u8(from)), 0); } template <> EIGEN_STRONG_INLINE Packet8uc ploadquad(const uint8_t* from) { return vreinterpret_u8_u32( - vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]); + vzip_u32(vreinterpret_u32_u8(eigen_vld1_dup_u8(from)), vreinterpret_u32_u8(eigen_vld1_dup_u8(from + 1))).val[0]); } template <> EIGEN_STRONG_INLINE Packet16uc ploadquad(const uint8_t* from) { const uint8x8_t a = vreinterpret_u8_u32( - vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]); + vzip_u32(vreinterpret_u32_u8(eigen_vld1_dup_u8(from)), vreinterpret_u32_u8(eigen_vld1_dup_u8(from + 1))).val[0]); const uint8x8_t b = vreinterpret_u8_u32( - vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from + 2)), vreinterpret_u32_u8(vld1_dup_u8(from + 3))).val[0]); + vzip_u32(vreinterpret_u32_u8(eigen_vld1_dup_u8(from + 2)), vreinterpret_u32_u8(eigen_vld1_dup_u8(from + 3))) + .val[0]); return vcombine_u8(a, b); } template <> diff --git a/test/accelerate_support.cpp b/test/accelerate_support.cpp index 4620d41f3..ec0609aaf 100644 --- a/test/accelerate_support.cpp +++ b/test/accelerate_support.cpp @@ -1,11 +1,19 @@ -#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS -#include "sparse_solver.h" - #if defined(DEBUG) #undef DEBUG #endif +#define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS + +#pragma clang diagnostic push +// The following "warning" causes a build failure on macOS with the latest +// version of clang: +// error: non-defining declaration of enumeration with a fixed underlying +// type is only permitted as a standalone declaration +#pragma clang diagnostic ignored "-Welaborated-enum-base" #include +#pragma clang diagnostic pop + +#include "sparse_solver.h" template int generate_sparse_rectangular_problem(MatrixType& A, DenseMat& dA, int maxRows = 300, int maxCols = 300) { diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 390960a88..f021cc2e9 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -86,6 +86,13 @@ inline T REF_ABS_DIFF(const T& a, const T& b) { return a > b ? a - b : b - a; } +// MacOS apple-clang has an issue with pcmp_eq for half when inlined, +// resulting in an ICE, but only in this specific test. +template +EIGEN_DONT_INLINE Packet REF_PCMP_EQ(const Packet& a, const Packet& b) { + return internal::pcmp_eq(a, b); +} + // Specializations for bool. template <> inline bool REF_ADD(const bool& a, const bool& b) { @@ -361,21 +368,21 @@ void packetmath_boolean_mask_ops() { data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); } - CHECK_CWISE2_MASK(internal::pcmp_eq, internal::pcmp_eq); + CHECK_CWISE2_MASK(REF_PCMP_EQ, internal::pcmp_eq); // Test (-0) == (0) for signed operations for (int i = 0; i < PacketSize; ++i) { data1[i] = Scalar(-0.0); data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); } - CHECK_CWISE2_MASK(internal::pcmp_eq, internal::pcmp_eq); + CHECK_CWISE2_MASK(REF_PCMP_EQ, internal::pcmp_eq); // Test NaN for (int i = 0; i < PacketSize; ++i) { data1[i] = NumTraits::quiet_NaN(); data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); } - CHECK_CWISE2_MASK(internal::pcmp_eq, internal::pcmp_eq); + CHECK_CWISE2_MASK(REF_PCMP_EQ, internal::pcmp_eq); } template diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 5efa7e822..4fb5125d8 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -7,7 +7,7 @@ endif() set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported") add_custom_target(BuildUnsupported) -include_directories(../../test ../../unsupported ../../Eigen +include_directories(../../test ../../unsupported ${CMAKE_CURRENT_BINARY_DIR}/../../test) find_package (Threads)