From 8cdc0fa67d6c6d608283ad443973877412a2f109 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 5 Dec 2025 13:50:16 -0500 Subject: [PATCH 1/2] Fix naming of predux_half for RVV when LMUL > 1 libeigen/eigen!2087 --- Eigen/src/Core/arch/RVV10/PacketMath2.h | 20 ++++++++++---------- Eigen/src/Core/arch/RVV10/PacketMathFP16.h | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/arch/RVV10/PacketMath2.h b/Eigen/src/Core/arch/RVV10/PacketMath2.h index 1fda51131..e230ba16b 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath2.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath2.h @@ -266,7 +266,7 @@ template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, Packet2Xi>::type -predux_half_dowto4(const Packet4Xi& a) { +predux_half(const Packet4Xi& a) { return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), __riscv_vget_v_i32m4_i32m2(a, 1), unpacket_traits::size); } @@ -275,7 +275,7 @@ template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, Packet1Xi>::type -predux_half_dowto4(const Packet2Xi& a) { +predux_half(const Packet2Xi& a) { return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1), unpacket_traits::size); } @@ -611,7 +611,7 @@ template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, Packet2Xf>::type -predux_half_dowto4(const Packet4Xf& a) { +predux_half(const Packet4Xf& a) { return __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(a, 0), __riscv_vget_v_f32m4_f32m2(a, 1), unpacket_traits::size); } @@ -620,7 +620,7 @@ template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, Packet1Xf>::type -predux_half_dowto4(const Packet2Xf& a) { +predux_half(const Packet2Xf& a) { return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), unpacket_traits::size); } @@ -876,7 +876,7 @@ template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, Packet2Xl>::type -predux_half_dowto4(const Packet4Xl& a) { +predux_half(const Packet4Xl& a) { return __riscv_vadd_vv_i64m2(__riscv_vget_v_i64m4_i64m2(a, 0), __riscv_vget_v_i64m4_i64m2(a, 1), unpacket_traits::size); } @@ -885,7 +885,7 @@ template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, Packet1Xl>::type -predux_half_dowto4(const Packet2Xl& a) { +predux_half(const Packet2Xl& a) { return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), unpacket_traits::size); } @@ -1222,7 +1222,7 @@ template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, Packet2Xd>::type -predux_half_dowto4(const Packet4Xd& a) { +predux_half(const Packet4Xd& a) { return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), __riscv_vget_v_f64m4_f64m2(a, 1), unpacket_traits::size); } @@ -1231,7 +1231,7 @@ template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, Packet1Xd>::type -predux_half_dowto4(const Packet2Xd& a) { +predux_half(const Packet2Xd& a) { return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), unpacket_traits::size); } @@ -1486,7 +1486,7 @@ template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, Packet2Xs>::type -predux_half_dowto4(const Packet4Xs& a) { +predux_half(const Packet4Xs& a) { return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), __riscv_vget_v_i16m4_i16m2(a, 1), unpacket_traits::size); } @@ -1495,7 +1495,7 @@ template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, Packet1Xs>::type -predux_half_dowto4(const Packet2Xs& a) { +predux_half(const Packet2Xs& a) { return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), unpacket_traits::size); } diff --git a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h index fbda19138..848e0ca0a 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h +++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h @@ -811,7 +811,7 @@ template EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, PacketXh>::type -predux_half_dowto4(const Packet2Xh& a) { +predux_half(const Packet2Xh& a) { return __riscv_vfadd_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1), unpacket_traits::size); } From 9b00db8cb9154477b93b342cf418b5da5d7f58a0 Mon Sep 17 00:00:00 2001 From: Gregory Meyer Date: Tue, 9 Dec 2025 10:36:45 -0800 Subject: [PATCH 2/2] Simplify thread-safe initialization of GpuDeviceProperties. libeigen/eigen!2089 --- .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h | 72 ++++++------------- 1 file changed, 22 insertions(+), 50 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h index 2a3b08732..a13aa7f72 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -44,68 +44,40 @@ class StreamInterface { class GpuDeviceProperties { public: - GpuDeviceProperties() : initialized_(false), first_(true), device_properties_(nullptr) {} + static const GpuDeviceProperties& instance() { + static const GpuDeviceProperties& kInstance = *new GpuDeviceProperties(); - ~GpuDeviceProperties() { - if (device_properties_) { - delete[] device_properties_; - } + return kInstance; } EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { return device_properties_[device]; } - EIGEN_STRONG_INLINE bool isInitialized() const { return initialized_; } + private: + GpuDeviceProperties() = default; - void initialize() { - if (!initialized_) { - // Attempts to ensure proper behavior in the case of multiple threads - // calling this function simultaneously. This would be trivial to - // implement if we could use std::mutex, but unfortunately mutex don't - // compile with nvcc, so we resort to atomics and thread fences instead. - // Note that if the caller uses a compiler that doesn't support c++11 we - // can't ensure that the initialization is thread safe. - if (first_.exchange(false)) { - // We're the first thread to reach this point. - int num_devices; - gpuError_t status = gpuGetDeviceCount(&num_devices); - if (status != gpuSuccess) { - std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl; - gpu_assert(status == gpuSuccess); - } - device_properties_ = new gpuDeviceProp_t[num_devices]; - for (int i = 0; i < num_devices; ++i) { - status = gpuGetDeviceProperties(&device_properties_[i], i); - if (status != gpuSuccess) { - std::cerr << "Failed to initialize GPU device #" << i << ": " << gpuGetErrorString(status) << std::endl; - gpu_assert(status == gpuSuccess); - } - } - - std::atomic_thread_fence(std::memory_order_release); - initialized_ = true; - } else { - // Wait for the other thread to inititialize the properties. - while (!initialized_) { - std::atomic_thread_fence(std::memory_order_acquire); - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } + static std::vector GetDeviceProperties() { + int num_devices = 0; + gpuError_t status = gpuGetDeviceCount(&num_devices); + if (status != gpuSuccess) { + std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl; + gpu_assert(status == gpuSuccess); + } + std::vector device_properties(num_devices); + for (int i = 0; i < num_devices; ++i) { + status = gpuGetDeviceProperties(&device_properties_[i], i); + if (status != gpuSuccess) { + std::cerr << "Failed to initialize GPU device #" << i << ": " << gpuGetErrorString(status) << std::endl; + gpu_assert(status == gpuSuccess); } } + + return device_properties; } - private: - volatile bool initialized_; - std::atomic first_; - gpuDeviceProp_t* device_properties_; + std::vector device_properties_ = GetDeviceProperties(); }; -EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() { - static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties(); - if (!deviceProperties->isInitialized()) { - deviceProperties->initialize(); - } - return *deviceProperties; -} +EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() { return GpuDeviceProperties::instance(); } EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) { return GetGpuDeviceProperties().get(device);