GPU: Add sparse solvers, FFT, and SpMV (cuDSS, cuFFT, cuSPARSE)

Add GPU sparse direct solvers (Cholesky, LDL^T, LU) via cuDSS, 1D/2D FFT via cuFFT with plan caching, and sparse matrix-vector/matrix multiply (SpMV/SpMM) via cuSPARSE. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
GPU: Add dense cuSOLVER solvers (QR, SVD, EigenSolver)
2026-04-10 11:34:33 +08:00 · 2026-04-09 19:11:49 -07:00 · 2026-04-09 19:11:34 -07:00 · 2026-04-09 19:05:25 -07:00 · 2026-04-09 15:21:39 -07:00
76 changed files with 10763 additions and 724 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.10.0)
+cmake_minimum_required(VERSION 3.17)

 #==============================================================================
 # CMake Policy issues.
@@ -9,7 +9,7 @@ if (POLICY CMP0077)
 endif (POLICY CMP0077)

 # NOTE Remove setting the policy once the minimum required CMake version is
-# increased to at least 3.15. Retain enabling the export to package registry.
+# increased to at least 3.21. Retain enabling the export to package registry.
 if (POLICY CMP0090)
  # The export command does not populate package registry by default
  cmake_policy (SET CMP0090 NEW)
@@ -672,7 +672,7 @@ if (EIGEN_BUILD_TESTING)
  endif()

  set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.")
-  set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")
+  set(EIGEN_CUDA_COMPUTE_ARCH 70 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")

  option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
  if(EIGEN_TEST_SYCL)
@@ -817,4 +817,3 @@ endif()
 message(STATUS "")
 message(STATUS "Configured Eigen ${EIGEN_VERSION_STRING}")
 message(STATUS "")
-
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -50,9 +50,9 @@
 #include "src/Core/util/AOCL_Support.h"


-#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
-#define EIGEN_HAS_GPU_FP16
-#endif
+// EIGEN_HAS_GPU_FP16 is now always true when compiling with CUDA or HIP.
+// Use EIGEN_GPUCC (compile-time) or EIGEN_GPU_COMPILE_PHASE (device phase) instead.
+// TODO: Remove EIGEN_HAS_GPU_BF16 similarly once HIP bf16 guards are cleaned up.

 #if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16)
 #define EIGEN_HAS_GPU_BF16
--- a/Eigen/GPU
+++ b/Eigen/GPU
@@ -0,0 +1,69 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GPU_MODULE_H
+#define EIGEN_GPU_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup GPU_Module GPU module
+ *
+ * GPU-accelerated solvers and operations using NVIDIA CUDA libraries
+ * (cuSOLVER, cuBLAS, cuSPARSE, cuFFT, cuDSS).
+ *
+ * This module provides explicit GPU solver classes that coexist with Eigen's
+ * CPU solvers. Unlike the LAPACKE dispatch (which replaces the CPU
+ * implementation globally), GPU classes are separate types the user
+ * instantiates by choice:
+ *
+ * \code
+ * #define EIGEN_USE_GPU
+ * #include <Eigen/GPU>
+ *
+ * // CPU path (unchanged)
+ * Eigen::LLT<Eigen::MatrixXd> llt_cpu(A);
+ *
+ * // GPU path (explicit)
+ * Eigen::GpuLLT<double> llt_gpu(A);   // L stays on device
+ * auto X = llt_gpu.solve(B);          // only B transferred per solve
+ * \endcode
+ *
+ * Requires CUDA 11.4+. See CLAUDE.md.
+ */
+
+#ifdef EIGEN_USE_GPU
+// IWYU pragma: begin_exports
+#include "src/GPU/DeviceMatrix.h"
+#include "src/GPU/GpuContext.h"
+#include "src/GPU/DeviceExpr.h"
+#include "src/GPU/DeviceBlasExpr.h"
+#include "src/GPU/DeviceSolverExpr.h"
+#include "src/GPU/DeviceDispatch.h"
+#include "src/GPU/GpuLLT.h"
+#include "src/GPU/GpuLU.h"
+#include "src/GPU/GpuQR.h"
+#include "src/GPU/GpuSVD.h"
+#include "src/GPU/GpuEigenSolver.h"
+#include "src/GPU/CuFftSupport.h"
+#include "src/GPU/GpuFFT.h"
+#include "src/GPU/CuSparseSupport.h"
+#include "src/GPU/GpuSparseContext.h"
+#ifdef EIGEN_CUDSS
+#include "src/GPU/CuDssSupport.h"
+#include "src/GPU/GpuSparseSolverBase.h"
+#include "src/GPU/GpuSparseLLT.h"
+#include "src/GPU/GpuSparseLDLT.h"
+#include "src/GPU/GpuSparseLU.h"
+#endif
+// IWYU pragma: end_exports
+#endif
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif  // EIGEN_GPU_MODULE_H
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -858,16 +858,8 @@ struct hash<Eigen::bfloat16> {
 }  // namespace std
 #endif

-// Add the missing shfl* intrinsics.
-// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
-//   CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
-//
-// HIP and CUDA prior to SDK 9.0 define
-//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
-// CUDA since 9.0 deprecates those and instead defines
-//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
-//    with native support for __half and __nv_bfloat16
-//
+// Warp shuffle overloads for Eigen::bfloat16.
+// HIP uses non-sync __shfl variants; CUDA has native __nv_bfloat16 support in __shfl_sync.
 // Note that the following are __device__ - only functions.
 #if defined(EIGEN_HIPCC)

--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -45,7 +45,7 @@
 // Eigen with GPU support.
 // Any functions that require `numext::bit_cast` may also not be constexpr,
 // including any native types when setting via raw bit values.
-#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+#if defined(EIGEN_GPUCC) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
 #define _EIGEN_MAYBE_CONSTEXPR
 #else
 #define _EIGEN_MAYBE_CONSTEXPR constexpr
@@ -121,12 +121,12 @@ namespace half_impl {
 //
 // Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
 // this error, and hence the following convoluted #if condition
-#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)

 // Make our own __half_raw definition that is similar to CUDA's.
 struct __half_raw {
  struct construct_from_rep_tag {};
-#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
+#if (defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE))
  // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
  // The element type for shared memory cannot have non-trivial constructors
  // and hence the following special casing (which skips the zero-initilization).
@@ -152,16 +152,12 @@ struct __half_raw {
 #endif
 };

-#elif defined(EIGEN_HAS_HIP_FP16)
+#elif defined(EIGEN_HIPCC)
 // HIP GPU compile phase: nothing to do here.
 // HIP fp16 header file has a definition for __half_raw
-#elif defined(EIGEN_HAS_CUDA_FP16)
+#elif defined(EIGEN_CUDACC)

 // CUDA GPU compile phase.
-#if EIGEN_CUDA_SDK_VER < 90000
-// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
-typedef __half __half_raw;
-#endif  // defined(EIGEN_HAS_CUDA_FP16)

 #elif defined(SYCL_DEVICE_ONLY)
 typedef cl::sycl::half __half_raw;
@@ -175,15 +171,13 @@ struct half_base : public __half_raw {
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {}
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}

-#if defined(EIGEN_HAS_GPU_FP16)
-#if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_GPUCC)
+#if defined(EIGEN_HIPCC)
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
-#elif defined(EIGEN_HAS_CUDA_FP16)
-#if EIGEN_CUDA_SDK_VER >= 90000
+#elif defined(EIGEN_CUDACC)
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
 #endif
 #endif
-#endif
 };

 }  // namespace half_impl
@@ -192,36 +186,29 @@ struct half_base : public __half_raw {
 struct half : public half_impl::half_base {
  // Writing this out as separate #if-else blocks to make the code easier to follow
  // The same applies to most #if-else blocks in this file
-#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)
  // Use the same base class for the following two scenarios
  // * when compiling without GPU support enabled
  // * during host compile phase when compiling with GPU support enabled
  typedef half_impl::__half_raw __half_raw;
-#elif defined(EIGEN_HAS_HIP_FP16)
+#elif defined(EIGEN_HIPCC)
  // Nothing to do here
  // HIP fp16 header file has a definition for __half_raw
-#elif defined(EIGEN_HAS_CUDA_FP16)
-// Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
-// (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP!  So keeping this within
-// #if defined(EIGEN_HAS_CUDA_FP16) is needed
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-  typedef half_impl::__half_raw __half_raw;
-#endif
+#elif defined(EIGEN_CUDACC)
+  // Nothing to do here.
 #endif

  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {}

  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}

-#if defined(EIGEN_HAS_GPU_FP16)
-#if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_GPUCC)
+#if defined(EIGEN_HIPCC)
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
-#elif defined(EIGEN_HAS_CUDA_FP16)
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+#elif defined(EIGEN_CUDACC)
  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
 #endif
 #endif
-#endif

 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
  explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b)
@@ -248,7 +235,7 @@ struct half : public half_impl::half_base {
    return half_impl::half_to_float(*this);
  }

-#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE)
  EIGEN_DEVICE_FUNC operator __half() const {
    ::__half_raw hr;
    hr.x = x;
@@ -380,8 +367,7 @@ namespace Eigen {

 namespace half_impl {

-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 // Note: We deliberately do *not* define this to 1 even if we have Arm's native
 // fp16 type since GPU half types are rather different from native CPU half types.
 #define EIGEN_HAS_NATIVE_GPU_FP16
@@ -393,24 +379,10 @@ namespace half_impl {
 // conversion steps back and forth.

 #if defined(EIGEN_HAS_NATIVE_GPU_FP16)
-EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  return __hadd(::__half(a), ::__half(b));
-#else
-  return __hadd(a, b);
-#endif
-}
+EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) { return __hadd(::__half(a), ::__half(b)); }
 EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { return __hmul(a, b); }
 EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { return __hsub(a, b); }
-EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) {
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  return __hdiv(a, b);
-#else
-  float num = __half2float(a);
-  float denom = __half2float(b);
-  return __float2half(num / denom);
-#endif
-}
+EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) { return __hdiv(a, b); }
 EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { return __hneg(a); }
 EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) {
  a = a + b;
@@ -505,7 +477,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half&
 // We need to provide emulated *host-side* FP16 operators for clang.
 #pragma push_macro("EIGEN_DEVICE_FUNC")
 #undef EIGEN_DEVICE_FUNC
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
+#if defined(EIGEN_CUDACC) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
 #define EIGEN_DEVICE_FUNC __host__
 #else  // both host and device need emulated ops.
 #define EIGEN_DEVICE_FUNC __host__ __device__
@@ -636,7 +608,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint
  // because this is constexpr function.
  // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
  // of this catch22 by having separate bodies for GPU / non GPU
-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_GPUCC)
  __half_raw h;
  h.x = x;
  return h;
@@ -661,8 +633,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const
 }

 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  __half tmp_ff = __float2half(ff);
  return *(__half_raw*)&tmp_ff;

@@ -735,8 +706,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
 }

 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return __half2float(h);
 #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
  return static_cast<float>(h.x);
@@ -778,8 +748,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return __hisnan(a);
 #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
  return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
@@ -810,16 +779,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hexp(a));
 #else
  return half(::expf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hexp2(a));
 #else
  return half(::exp2f(float(a)));
@@ -827,9 +794,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
-     EIGEN_CUDA_ARCH >= 530) ||                                                                 \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return half(hlog(a));
 #else
  return half(::logf(float(a)));
@@ -842,8 +807,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
 }

 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hsqrt(a));
 #else
  return half(::sqrtf(float(a)));
@@ -864,16 +828,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { return half(::a
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) { return half(::atanf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) { return half(::atanhf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hfloor(a));
 #else
  return half(::floorf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE)
  return half(hceil(a));
 #else
  return half(::ceilf(float(a)));
@@ -1007,20 +969,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half madd<Eigen::half>(const Eigen:
 }  // namespace numext
 }  // namespace Eigen

-// Add the missing shfl* intrinsics.
-// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
-//   CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
-//
-// HIP and CUDA prior to SDK 9.0 define
-//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
-// CUDA since 9.0 deprecates those and instead defines
-//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
-//    with native support for __half and __nv_bfloat16
-//
+// Warp shuffle overloads for Eigen::half.
+// CUDA uses __shfl_*_sync (with mask); HIP uses __shfl_* (no mask).
 // Note that the following are __device__ - only functions.
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) || defined(EIGEN_HIPCC)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)

-#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
+#if defined(EIGEN_CUDACC)

 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
                                                       int width = warpSize) {
@@ -1046,7 +1000,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen:
  return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
 }

-#else  // HIP or CUDA SDK < 9.0
+#else  // HIP

 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
@@ -1072,7 +1026,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
 #endif  // __shfl*

 // ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) || defined(EIGEN_HIPCC)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
 EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
  return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
 }
@@ -1095,8 +1049,7 @@ namespace internal {
 template <>
 struct cast_impl<float, half> {
  EIGEN_DEVICE_FUNC static inline half run(const float& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
    return __float2half(a);
 #else
    return half(a);
@@ -1107,8 +1060,7 @@ struct cast_impl<float, half> {
 template <>
 struct cast_impl<int, half> {
  EIGEN_DEVICE_FUNC static inline half run(const int& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
    return __float2half(static_cast<float>(a));
 #else
    return half(static_cast<float>(a));
@@ -1119,8 +1071,7 @@ struct cast_impl<int, half> {
 template <>
 struct cast_impl<half, float> {
  EIGEN_DEVICE_FUNC static inline float run(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
    return __half2float(a);
 #else
    return static_cast<float>(a);
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -17,19 +17,8 @@ namespace Eigen {

 namespace internal {

-// Read-only data cached load available.
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
-#define EIGEN_GPU_HAS_LDG 1
-#endif
-
-// FP16 math available.
-#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
-#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
-#endif
-
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
-#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
-#endif
+// Read-only data cached load (__ldg) and native FP16 arithmetic are available
+// on all supported GPU architectures (sm_70+ for CUDA, GFX906+ for HIP).

 // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
 // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
@@ -56,92 +45,84 @@ struct is_arithmetic<double2> {

 template <>
 struct packet_traits<float> : default_packet_traits {
-  typedef float4 type;
-  typedef float4 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 4,
+  using type = float4;
+  using half = float4;
+  static constexpr int Vectorizable = 1;
+  static constexpr int AlignedOnScalar = 1;
+  static constexpr int size = 4;

-    HasDiv = 1,
-    HasSin = 0,
-    HasCos = 0,
-    HasLog = 1,
-    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasLGamma = 1,
-    HasDiGamma = 1,
-    HasZeta = 1,
-    HasPolygamma = 1,
-    HasErf = 1,
-    HasErfc = 1,
-    HasNdtri = 1,
-    HasBessel = 1,
-    HasIGamma = 1,
-    HasIGammaDerA = 1,
-    HasGammaSampleDerAlpha = 1,
-    HasIGammac = 1,
-    HasBetaInc = 1,
+  static constexpr int HasDiv = 1;
+  static constexpr int HasSin = 0;
+  static constexpr int HasCos = 0;
+  static constexpr int HasLog = 1;
+  static constexpr int HasExp = 1;
+  static constexpr int HasSqrt = 1;
+  static constexpr int HasRsqrt = 1;
+  static constexpr int HasLGamma = 1;
+  static constexpr int HasDiGamma = 1;
+  static constexpr int HasZeta = 1;
+  static constexpr int HasPolygamma = 1;
+  static constexpr int HasErf = 1;
+  static constexpr int HasErfc = 1;
+  static constexpr int HasNdtri = 1;
+  static constexpr int HasBessel = 1;
+  static constexpr int HasIGamma = 1;
+  static constexpr int HasIGammaDerA = 1;
+  static constexpr int HasGammaSampleDerAlpha = 1;
+  static constexpr int HasIGammac = 1;
+  static constexpr int HasBetaInc = 1;

-    HasFloor = 1,
-    HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
-  };
+  static constexpr int HasFloor = 1;
+  static constexpr int HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS;
 };

 template <>
 struct packet_traits<double> : default_packet_traits {
-  typedef double2 type;
-  typedef double2 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 2,
+  using type = double2;
+  using half = double2;
+  static constexpr int Vectorizable = 1;
+  static constexpr int AlignedOnScalar = 1;
+  static constexpr int size = 2;

-    HasDiv = 1,
-    HasLog = 1,
-    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasLGamma = 1,
-    HasDiGamma = 1,
-    HasZeta = 1,
-    HasPolygamma = 1,
-    HasErf = 1,
-    HasErfc = 1,
-    HasNdtri = 1,
-    HasBessel = 1,
-    HasIGamma = 1,
-    HasIGammaDerA = 1,
-    HasGammaSampleDerAlpha = 1,
-    HasIGammac = 1,
-    HasBetaInc = 1,
-  };
+  static constexpr int HasDiv = 1;
+  static constexpr int HasLog = 1;
+  static constexpr int HasExp = 1;
+  static constexpr int HasSqrt = 1;
+  static constexpr int HasRsqrt = 1;
+  static constexpr int HasLGamma = 1;
+  static constexpr int HasDiGamma = 1;
+  static constexpr int HasZeta = 1;
+  static constexpr int HasPolygamma = 1;
+  static constexpr int HasErf = 1;
+  static constexpr int HasErfc = 1;
+  static constexpr int HasNdtri = 1;
+  static constexpr int HasBessel = 1;
+  static constexpr int HasIGamma = 1;
+  static constexpr int HasIGammaDerA = 1;
+  static constexpr int HasGammaSampleDerAlpha = 1;
+  static constexpr int HasIGammac = 1;
+  static constexpr int HasBetaInc = 1;
 };

 template <>
 struct unpacket_traits<float4> {
-  typedef float type;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef float4 half;
+  using type = float;
+  static constexpr int size = 4;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = float4;
 };
 template <>
 struct unpacket_traits<double2> {
-  typedef double type;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef double2 half;
+  using type = double;
+  static constexpr int size = 2;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = double2;
 };

 template <>
@@ -403,7 +384,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const dou

 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return __ldg(reinterpret_cast<const float4*>(from));
 #else
  return make_float4(from[0], from[1], from[2], from[3]);
@@ -411,7 +392,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
 }
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return __ldg(reinterpret_cast<const double2*>(from));
 #else
  return make_double2(from[0], from[1]);
@@ -420,7 +401,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const

 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
 #else
  return make_float4(from[0], from[1], from[2], from[3]);
@@ -428,7 +409,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
 }
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return make_double2(__ldg(from + 0), __ldg(from + 1));
 #else
  return make_double2(from[0], from[1]);
@@ -591,23 +572,20 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {

 #endif  // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)

-// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
-// on device. There is no benefit to using them on the host anyways, since they are
-// emulated.
-#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
+// Half-packet functions are only available in GPU device compilation — they use
+// intrinsics (__half2, etc.) that have no host-side benefit.
+#if defined(EIGEN_GPU_COMPILE_PHASE)

-typedef ulonglong2 Packet4h2;
+using Packet4h2 = ulonglong2;
 template <>
 struct unpacket_traits<Packet4h2> {
-  typedef Eigen::half type;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef Packet4h2 half;
+  using type = Eigen::half;
+  static constexpr int size = 8;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = Packet4h2;
 };
 template <>
 struct is_arithmetic<Packet4h2> {
@@ -616,15 +594,13 @@ struct is_arithmetic<Packet4h2> {

 template <>
 struct unpacket_traits<half2> {
-  typedef Eigen::half type;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef half2 half;
+  using type = Eigen::half;
+  static constexpr int size = 2;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = half2;
 };
 template <>
 struct is_arithmetic<half2> {
@@ -633,23 +609,21 @@ struct is_arithmetic<half2> {

 template <>
 struct packet_traits<Eigen::half> : default_packet_traits {
-  typedef Packet4h2 type;
-  typedef Packet4h2 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 8,
-    HasAdd = 1,
-    HasSub = 1,
-    HasMul = 1,
-    HasDiv = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasExp = 1,
-    HasExpm1 = 1,
-    HasLog = 1,
-    HasLog1p = 1
-  };
+  using type = Packet4h2;
+  using half = Packet4h2;
+  static constexpr int Vectorizable = 1;
+  static constexpr int AlignedOnScalar = 1;
+  static constexpr int size = 8;
+  static constexpr int HasAdd = 1;
+  static constexpr int HasSub = 1;
+  static constexpr int HasMul = 1;
+  static constexpr int HasDiv = 1;
+  static constexpr int HasSqrt = 1;
+  static constexpr int HasRsqrt = 1;
+  static constexpr int HasExp = 1;
+  static constexpr int HasExpm1 = 1;
+  static constexpr int HasLog = 1;
+  static constexpr int HasLog1p = 1;
 };

 template <>
@@ -690,7 +664,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2&
 }

 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  // Input is guaranteed to be properly aligned.
  return __ldg(reinterpret_cast<const half2*>(from));
 #else
@@ -699,7 +673,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half*
 }

 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  return __halves2half2(__ldg(from + 0), __ldg(from + 1));
 #else
  return __halves2half2(*(from + 0), *(from + 1));
@@ -745,12 +719,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& ker
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
-#else
-  float f = __half2float(a) + 1.0f;
-  return __halves2half2(a, __float2half(f));
-#endif
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
@@ -837,89 +806,21 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2&
  return __halves2half2(result1, result2);
 }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hadd2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 + b1;
-  float r2 = a2 + b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { return __hadd2(a, b); }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hsub2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 - b1;
-  float r2 = a2 - b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { return __hsub2(a, b); }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hneg2(a);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return __floats2half2_rn(-a1, -a2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { return __hneg2(a); }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hmul2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 * b1;
-  float r2 = a2 * b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { return __hmul2(a, b); }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hfma2(a, b, c);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float c1 = __low2float(c);
-  float c2 = __high2float(c);
-  float r1 = a1 * b1 + c1;
-  float r2 = a2 * b2 + c2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }

-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __h2div(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 / b1;
-  float r2 = a2 / b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { return __h2div(a, b); }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
  float a1 = __low2float(a);
@@ -942,47 +843,23 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b)
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hadd(__low2half(a), __high2half(a));
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return Eigen::half(__float2half(a1 + a2));
-#endif
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  __half first = __low2half(a);
  __half second = __high2half(a);
  return __hgt(first, second) ? first : second;
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return a1 > a2 ? __low2half(a) : __high2half(a);
-#endif
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  __half first = __low2half(a);
  __half second = __high2half(a);
  return __hlt(first, second) ? first : second;
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return a1 < a2 ? __low2half(a) : __high2half(a);
-#endif
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hmul(__low2half(a), __high2half(a));
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return Eigen::half(__float2half(a1 * a2));
-#endif
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
@@ -1001,8 +878,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
  return __floats2half2_rn(r1, r2);
 }

-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
-
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
@@ -1010,41 +885,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }
-
-#else
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = logf(a1);
-  float r2 = logf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = expf(a1);
-  float r2 = expf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = sqrtf(a1);
-  float r2 = sqrtf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = rsqrtf(a1);
-  float r2 = rsqrtf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-#endif
 }  // namespace

 template <>
@@ -1091,19 +931,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to,

 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
  Packet4h2 r;
+#if defined(EIGEN_GPU_COMPILE_PHASE)
  r = __ldg(reinterpret_cast<const Packet4h2*>(from));
-  return r;
 #else
-  Packet4h2 r;
  half2* r_alias = reinterpret_cast<half2*>(&r);
  r_alias[0] = ploadt_ro_aligned(from + 0);
  r_alias[1] = ploadt_ro_aligned(from + 2);
  r_alias[2] = ploadt_ro_aligned(from + 4);
  r_alias[3] = ploadt_ro_aligned(from + 6);
-  return r;
 #endif
+  return r;
 }

 template <>
@@ -1272,7 +1110,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::ha
  p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
  p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
  return r;
-#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#elif defined(EIGEN_CUDA_ARCH)
  Packet4h2 r;
  half2* r_alias = reinterpret_cast<half2*>(&r);

@@ -1290,16 +1128,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::ha
  r_alias[3] = plset(__high2half(c));

  return r;
-
-#else
-  float f = __half2float(a);
-  Packet4h2 r;
-  half2* p_alias = reinterpret_cast<half2*>(&r);
-  p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
-  p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
-  p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
-  p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
-  return r;
 #endif
 }

@@ -1533,7 +1361,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Pa
  half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
  __half first = predux_max(m0);
  __half second = predux_max(m1);
-#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#if defined(EIGEN_CUDA_ARCH)
  return (__hgt(first, second) ? first : second);
 #else
  float ffirst = __half2float(first);
@@ -1549,7 +1377,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Pa
  half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
  __half first = predux_min(m0);
  __half second = predux_min(m1);
-#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#if defined(EIGEN_CUDA_ARCH)
  return (__hlt(first, second) ? first : second);
 #else
  float ffirst = __half2float(first);
@@ -1641,47 +1469,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h
 // the implementation of GPU half reduction.
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hadd2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 + b1;
-  float r2 = a2 + b2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }

 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __hmul2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 * b1;
-  float r2 = a2 * b2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }

 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
  return __h2div(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 / b1;
-  float r2 = a2 / b2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }

 template <>
@@ -1706,11 +1504,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const ha
  return __halves2half2(r1, r2);
 }

-#endif  // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
-
-#undef EIGEN_GPU_HAS_LDG
-#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
-#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
+#endif  // defined(EIGEN_GPU_COMPILE_PHASE)

 }  // end namespace internal

--- a/Eigen/src/Core/arch/GPU/TypeCasting.h
+++ b/Eigen/src/Core/arch/GPU/TypeCasting.h
@@ -17,8 +17,7 @@ namespace Eigen {

 namespace internal {

-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)

 template <>
 struct type_casting_traits<Eigen::half, float> {
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -541,12 +541,6 @@ extern "C" {
 #if defined EIGEN_CUDACC
 #define EIGEN_VECTORIZE_GPU
 #include <vector_types.h>
-#if EIGEN_CUDA_SDK_VER >= 70500
-#define EIGEN_HAS_CUDA_FP16
-#endif
-#endif
-
-#if defined(EIGEN_HAS_CUDA_FP16)
 #include <cuda_runtime_api.h>
 #include <cuda_fp16.h>
 #endif
@@ -554,7 +548,6 @@ extern "C" {
 #if defined(EIGEN_HIPCC)
 #define EIGEN_VECTORIZE_GPU
 #include <hip/hip_vector_types.h>
-#define EIGEN_HAS_HIP_FP16
 #include <hip/hip_fp16.h>
 #define EIGEN_HAS_HIP_BF16
 #include <hip/hip_bfloat16.h>
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -84,8 +84,7 @@
 #endif

 #if defined __NVCC__ && defined __CUDACC__
-// MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so
-// we instead use Microsoft's __pragma extension.
+// MSVC does not support the _Pragma keyword, so we use Microsoft's __pragma extension.
 #if defined _MSC_VER
 #define EIGEN_MAKE_PRAGMA(X) __pragma(#X)
 #else
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -148,13 +148,8 @@
 #endif

 #if defined(__NVCC__)
-#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+// CUDA 11.4+ always defines __CUDACC_VER_MAJOR__.
 #define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
-#elif defined(__CUDACC_VER__)
-#define EIGEN_COMP_NVCC __CUDACC_VER__
-#else
-#error "NVCC did not define compiler version."
-#endif
 #else
 #define EIGEN_COMP_NVCC 0
 #endif
@@ -575,6 +570,10 @@
 #define EIGEN_CUDA_SDK_VER 0
 #endif

+#if defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER > 0 && EIGEN_CUDA_SDK_VER < 110400
+#error "Eigen requires CUDA 11.4 or later."
+#endif
+
 #if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) && !defined(__SYCL_DEVICE_ONLY__)
 // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
 #define EIGEN_HIPCC __HIPCC__
@@ -584,22 +583,20 @@
 // ++ host_defines.h which contains the defines for the __host__ and __device__ macros
 #include <hip/hip_runtime.h>

+// Eigen requires ROCm/HIP >= 5.6 (GFX906 minimum architecture).
+// This floor exists to allow simplifying shared CUDA/HIP preprocessor guards —
+// all __HIP_ARCH_HAS_WARP_SHUFFLE__, __HIP_ARCH_HAS_FP16__, etc. are always true on GFX906+.
+#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 5 || (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 6))
+#error "Eigen requires ROCm/HIP >= 5.6."
+#endif
+
 #if defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
 // analogous to EIGEN_CUDA_ARCH, but for HIP
 #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
 #endif

-// For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
-// value to 1024. The compiler assigns a default value of 256 when the attribute is not
-// specified. This results in failures on the HIP platform, for cases when a GPU kernel
-// without an explicit launch_bounds attribute is called with a threads_per_block value
-// greater than 256.
-//
-// This is a regression in functionality and is expected to be fixed within the next
-// couple of ROCm releases (compiler will go back to using 1024 value as the default)
-//
-// In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds
-// attribute.
+// HIP compilers default to launch_bounds(256), which causes failures when kernels
+// are called with more than 256 threads per block. Explicitly set to 1024 for HIP.

 #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)

--- a/Eigen/src/GPU/CuBlasSupport.h
+++ b/Eigen/src/GPU/CuBlasSupport.h
@@ -0,0 +1,233 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// cuBLAS-specific support types:
+//   - Error-checking macro
+//   - Operation enum and mapping to cublasOperation_t
+//
+// Generic CUDA runtime utilities (DeviceBuffer, cuda_data_type) are in GpuSupport.h.
+
+#ifndef EIGEN_GPU_CUBLAS_SUPPORT_H
+#define EIGEN_GPU_CUBLAS_SUPPORT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSupport.h"
+#include <cublas_v2.h>
+
+namespace Eigen {
+namespace internal {
+
+// ---- Error-checking macro ---------------------------------------------------
+
+#define EIGEN_CUBLAS_CHECK(expr)                                       \
+  do {                                                                 \
+    cublasStatus_t _s = (expr);                                        \
+    eigen_assert(_s == CUBLAS_STATUS_SUCCESS && "cuBLAS call failed"); \
+  } while (0)
+
+// ---- Operation enum ---------------------------------------------------------
+// Maps transpose/adjoint flags to cublasOperation_t.
+
+enum class GpuOp { NoTrans, Trans, ConjTrans };
+
+constexpr cublasOperation_t to_cublas_op(GpuOp op) {
+  switch (op) {
+    case GpuOp::Trans:
+      return CUBLAS_OP_T;
+    case GpuOp::ConjTrans:
+      return CUBLAS_OP_C;
+    default:
+      return CUBLAS_OP_N;
+  }
+}
+
+// ---- Scalar → cublasComputeType_t -------------------------------------------
+// cublasGemmEx requires a compute type (separate from the data type).
+//
+// Precision policy:
+//   - Default: tensor core algorithms enabled (CUBLAS_GEMM_DEFAULT_TENSOR_OP).
+//     For double, cuBLAS may use Ozaki emulation on sm_80+ tensor cores.
+//   - EIGEN_CUDA_TF32: opt-in to TF32 for float (~2x faster, 10-bit mantissa).
+//   - EIGEN_NO_CUDA_TENSOR_OPS: disables all tensor core usage. Uses pedantic
+//     compute types and CUBLAS_GEMM_DEFAULT algorithm. For bit-exact reproducibility.
+
+template <typename Scalar>
+struct cuda_compute_type;
+
+template <>
+struct cuda_compute_type<float> {
+#if defined(EIGEN_NO_CUDA_TENSOR_OPS)
+  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_PEDANTIC;
+#elif defined(EIGEN_CUDA_TF32)
+  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_FAST_TF32;
+#else
+  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
+#endif
+};
+template <>
+struct cuda_compute_type<double> {
+#ifdef EIGEN_NO_CUDA_TENSOR_OPS
+  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F_PEDANTIC;
+#else
+  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
+#endif
+};
+template <>
+struct cuda_compute_type<std::complex<float>> {
+#if defined(EIGEN_NO_CUDA_TENSOR_OPS)
+  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_PEDANTIC;
+#elif defined(EIGEN_CUDA_TF32)
+  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_FAST_TF32;
+#else
+  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
+#endif
+};
+template <>
+struct cuda_compute_type<std::complex<double>> {
+#ifdef EIGEN_NO_CUDA_TENSOR_OPS
+  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F_PEDANTIC;
+#else
+  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
+#endif
+};
+// ---- GEMM algorithm hint ----------------------------------------------------
+
+constexpr cublasGemmAlgo_t cuda_gemm_algo() {
+#ifdef EIGEN_NO_CUDA_TENSOR_OPS
+  return CUBLAS_GEMM_DEFAULT;
+#else
+  return CUBLAS_GEMM_DEFAULT_TENSOR_OP;
+#endif
+}
+
+// ---- Alpha/beta scalar type for cublasGemmEx --------------------------------
+// For standard types, alpha/beta match the scalar type.
+
+template <typename Scalar>
+struct cuda_gemm_scalar {
+  using type = Scalar;
+};
+
+// ---- Type-specific cuBLAS wrappers ------------------------------------------
+// cuBLAS uses separate functions per type (Strsm, Dtrsm, etc.).
+// These overloaded wrappers allow calling cublasXtrsm/cublasXsymm/cublasXsyrk
+// with any supported scalar type.
+
+// TRSM wrappers
+inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
+                                  cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha,
+                                  const float* A, int lda, float* B, int ldb) {
+  return cublasStrsm(h, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+}
+inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
+                                  cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha,
+                                  const double* A, int lda, double* B, int ldb) {
+  return cublasDtrsm(h, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+}
+inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
+                                  cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
+                                  const std::complex<float>* alpha, const std::complex<float>* A, int lda,
+                                  std::complex<float>* B, int ldb) {
+  return cublasCtrsm(h, side, uplo, trans, diag, m, n, reinterpret_cast<const cuComplex*>(alpha),
+                     reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(B), ldb);
+}
+inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
+                                  cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
+                                  const std::complex<double>* alpha, const std::complex<double>* A, int lda,
+                                  std::complex<double>* B, int ldb) {
+  return cublasZtrsm(h, side, uplo, trans, diag, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
+                     reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<cuDoubleComplex*>(B), ldb);
+}
+
+// SYMM wrappers (real → symm, complex → hemm)
+inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
+                                  const float* alpha, const float* A, int lda, const float* B, int ldb,
+                                  const float* beta, float* C, int ldc) {
+  return cublasSsymm(h, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
+                                  const double* alpha, const double* A, int lda, const double* B, int ldb,
+                                  const double* beta, double* C, int ldc) {
+  return cublasDsymm(h, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
+                                  const std::complex<float>* alpha, const std::complex<float>* A, int lda,
+                                  const std::complex<float>* B, int ldb, const std::complex<float>* beta,
+                                  std::complex<float>* C, int ldc) {
+  return cublasChemm(h, side, uplo, m, n, reinterpret_cast<const cuComplex*>(alpha),
+                     reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<const cuComplex*>(B), ldb,
+                     reinterpret_cast<const cuComplex*>(beta), reinterpret_cast<cuComplex*>(C), ldc);
+}
+inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
+                                  const std::complex<double>* alpha, const std::complex<double>* A, int lda,
+                                  const std::complex<double>* B, int ldb, const std::complex<double>* beta,
+                                  std::complex<double>* C, int ldc) {
+  return cublasZhemm(h, side, uplo, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
+                     reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<const cuDoubleComplex*>(B), ldb,
+                     reinterpret_cast<const cuDoubleComplex*>(beta), reinterpret_cast<cuDoubleComplex*>(C), ldc);
+}
+
+// SYRK wrappers (real → syrk, complex → herk)
+inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
+                                  const float* alpha, const float* A, int lda, const float* beta, float* C, int ldc) {
+  return cublasSsyrk(h, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
+                                  const double* alpha, const double* A, int lda, const double* beta, double* C,
+                                  int ldc) {
+  return cublasDsyrk(h, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
+                                  const float* alpha, const std::complex<float>* A, int lda, const float* beta,
+                                  std::complex<float>* C, int ldc) {
+  return cublasCherk(h, uplo, trans, n, k, alpha, reinterpret_cast<const cuComplex*>(A), lda, beta,
+                     reinterpret_cast<cuComplex*>(C), ldc);
+}
+inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
+                                  const double* alpha, const std::complex<double>* A, int lda, const double* beta,
+                                  std::complex<double>* C, int ldc) {
+  return cublasZherk(h, uplo, trans, n, k, alpha, reinterpret_cast<const cuDoubleComplex*>(A), lda, beta,
+                     reinterpret_cast<cuDoubleComplex*>(C), ldc);
+}
+
+// GEAM wrappers: C = alpha * op(A) + beta * op(B)
+// Covers transpose, scale, matrix add/subtract in one call.
+inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
+                                  const float* alpha, const float* A, int lda, const float* beta, const float* B,
+                                  int ldb, float* C, int ldc) {
+  return cublasSgeam(h, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+}
+inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
+                                  const double* alpha, const double* A, int lda, const double* beta, const double* B,
+                                  int ldb, double* C, int ldc) {
+  return cublasDgeam(h, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+}
+inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
+                                  const std::complex<float>* alpha, const std::complex<float>* A, int lda,
+                                  const std::complex<float>* beta, const std::complex<float>* B, int ldb,
+                                  std::complex<float>* C, int ldc) {
+  return cublasCgeam(h, transa, transb, m, n, reinterpret_cast<const cuComplex*>(alpha),
+                     reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<const cuComplex*>(beta),
+                     reinterpret_cast<const cuComplex*>(B), ldb, reinterpret_cast<cuComplex*>(C), ldc);
+}
+inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
+                                  const std::complex<double>* alpha, const std::complex<double>* A, int lda,
+                                  const std::complex<double>* beta, const std::complex<double>* B, int ldb,
+                                  std::complex<double>* C, int ldc) {
+  return cublasZgeam(h, transa, transb, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
+                     reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<const cuDoubleComplex*>(beta),
+                     reinterpret_cast<const cuDoubleComplex*>(B), ldb, reinterpret_cast<cuDoubleComplex*>(C), ldc);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_CUBLAS_SUPPORT_H
--- a/Eigen/src/GPU/CuDssSupport.h
+++ b/Eigen/src/GPU/CuDssSupport.h
@@ -0,0 +1,134 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// cuDSS support utilities: error checking macro, type mapping.
+//
+// cuDSS is NVIDIA's sparse direct solver library, supporting Cholesky (LL^T),
+// LDL^T, and LU factorization on GPU. It requires CUDA 12.0+ and is
+// distributed separately from the CUDA Toolkit.
+
+#ifndef EIGEN_GPU_CUDSS_SUPPORT_H
+#define EIGEN_GPU_CUDSS_SUPPORT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSupport.h"
+#include <cudss.h>
+
+namespace Eigen {
+namespace internal {
+
+// ---- Error checking ---------------------------------------------------------
+
+#define EIGEN_CUDSS_CHECK(x)                                              \
+  do {                                                                    \
+    cudssStatus_t _s = (x);                                               \
+    eigen_assert(_s == CUDSS_STATUS_SUCCESS && "cuDSS call failed: " #x); \
+    EIGEN_UNUSED_VARIABLE(_s);                                            \
+  } while (0)
+
+// ---- Scalar → cudssMatrixType_t for SPD/HPD ---------------------------------
+
+template <typename Scalar>
+struct cudss_spd_type;
+
+template <>
+struct cudss_spd_type<float> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
+};
+template <>
+struct cudss_spd_type<double> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
+};
+template <>
+struct cudss_spd_type<std::complex<float>> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
+};
+template <>
+struct cudss_spd_type<std::complex<double>> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
+};
+
+// ---- Scalar → cudssMatrixType_t for symmetric/Hermitian ---------------------
+
+template <typename Scalar>
+struct cudss_symmetric_type;
+
+template <>
+struct cudss_symmetric_type<float> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
+};
+template <>
+struct cudss_symmetric_type<double> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
+};
+template <>
+struct cudss_symmetric_type<std::complex<float>> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
+};
+template <>
+struct cudss_symmetric_type<std::complex<double>> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
+};
+
+// ---- StorageIndex → cudaDataType_t ------------------------------------------
+
+template <typename StorageIndex>
+struct cudss_index_type;
+
+template <>
+struct cudss_index_type<int> {
+  static constexpr cudaDataType_t value = CUDA_R_32I;
+};
+template <>
+struct cudss_index_type<int64_t> {
+  static constexpr cudaDataType_t value = CUDA_R_64I;
+};
+
+// ---- UpLo → cudssMatrixViewType_t -------------------------------------------
+// For symmetric matrices stored as CSC (ColMajor), cuDSS sees CSR of A^T.
+// Since A = A^T, the data is the same, but the triangle view must be swapped.
+
+template <int UpLo, int StorageOrder>
+struct cudss_view_type;
+
+// ColMajor (CSC) passed as CSR: lower ↔ upper swap.
+template <>
+struct cudss_view_type<Lower, ColMajor> {
+  static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
+};
+template <>
+struct cudss_view_type<Upper, ColMajor> {
+  static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
+};
+
+// RowMajor (CSR) passed directly: no swap needed.
+template <>
+struct cudss_view_type<Lower, RowMajor> {
+  static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
+};
+template <>
+struct cudss_view_type<Upper, RowMajor> {
+  static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
+};
+
+}  // namespace internal
+
+// ---- Ordering enum ----------------------------------------------------------
+
+enum class GpuSparseOrdering {
+  AMD,    // Default fill-reducing ordering
+  METIS,  // METIS nested dissection
+  RCM     // Reverse Cuthill-McKee
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_CUDSS_SUPPORT_H
--- a/Eigen/src/GPU/CuFftSupport.h
+++ b/Eigen/src/GPU/CuFftSupport.h
@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// cuFFT support utilities: error checking macro, type mapping.
+
+#ifndef EIGEN_GPU_CUFFT_SUPPORT_H
+#define EIGEN_GPU_CUFFT_SUPPORT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSupport.h"
+#include <cufft.h>
+
+namespace Eigen {
+namespace internal {
+
+// ---- Error checking ---------------------------------------------------------
+
+#define EIGEN_CUFFT_CHECK(x)                                       \
+  do {                                                             \
+    cufftResult _r = (x);                                          \
+    eigen_assert(_r == CUFFT_SUCCESS && "cuFFT call failed: " #x); \
+    EIGEN_UNUSED_VARIABLE(_r);                                     \
+  } while (0)
+
+// ---- Scalar → cufftType traits ----------------------------------------------
+
+template <typename Scalar>
+struct cufft_c2c_type;
+
+template <>
+struct cufft_c2c_type<float> {
+  static constexpr cufftType value = CUFFT_C2C;
+};
+template <>
+struct cufft_c2c_type<double> {
+  static constexpr cufftType value = CUFFT_Z2Z;
+};
+
+template <typename Scalar>
+struct cufft_r2c_type;
+
+template <>
+struct cufft_r2c_type<float> {
+  static constexpr cufftType value = CUFFT_R2C;
+};
+template <>
+struct cufft_r2c_type<double> {
+  static constexpr cufftType value = CUFFT_D2Z;
+};
+
+template <typename Scalar>
+struct cufft_c2r_type;
+
+template <>
+struct cufft_c2r_type<float> {
+  static constexpr cufftType value = CUFFT_C2R;
+};
+template <>
+struct cufft_c2r_type<double> {
+  static constexpr cufftType value = CUFFT_Z2D;
+};
+
+// ---- Type-dispatched cuFFT execution ----------------------------------------
+
+// C2C
+inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<float>* in, std::complex<float>* out,
+                                         int direction) {
+  return cufftExecC2C(plan, reinterpret_cast<cufftComplex*>(in), reinterpret_cast<cufftComplex*>(out), direction);
+}
+inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<double>* in, std::complex<double>* out,
+                                         int direction) {
+  return cufftExecZ2Z(plan, reinterpret_cast<cufftDoubleComplex*>(in), reinterpret_cast<cufftDoubleComplex*>(out),
+                      direction);
+}
+
+// R2C
+inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, float* in, std::complex<float>* out) {
+  return cufftExecR2C(plan, in, reinterpret_cast<cufftComplex*>(out));
+}
+inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, double* in, std::complex<double>* out) {
+  return cufftExecD2Z(plan, in, reinterpret_cast<cufftDoubleComplex*>(out));
+}
+
+// C2R
+inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<float>* in, float* out) {
+  return cufftExecC2R(plan, reinterpret_cast<cufftComplex*>(in), out);
+}
+inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<double>* in, double* out) {
+  return cufftExecZ2D(plan, reinterpret_cast<cufftDoubleComplex*>(in), out);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_CUFFT_SUPPORT_H
--- a/Eigen/src/GPU/CuSolverSupport.h
+++ b/Eigen/src/GPU/CuSolverSupport.h
@@ -0,0 +1,159 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// cuSOLVER-specific support types:
+//   - cuSOLVER error-checking macro
+//   - RAII wrapper for cusolverDnParams
+//   - Scalar → cudaDataType_t mapping
+//   - (UpLo, StorageOrder) → cublasFillMode_t mapping
+//
+// Generic CUDA runtime utilities (DeviceBuffer, EIGEN_CUDA_RUNTIME_CHECK)
+// are in GpuSupport.h.
+
+#ifndef EIGEN_GPU_CUSOLVER_SUPPORT_H
+#define EIGEN_GPU_CUSOLVER_SUPPORT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSupport.h"
+#include <cusolverDn.h>
+
+namespace Eigen {
+namespace internal {
+
+// ---- Error-checking macros --------------------------------------------------
+
+#define EIGEN_CUSOLVER_CHECK(expr)                                         \
+  do {                                                                     \
+    cusolverStatus_t _s = (expr);                                          \
+    eigen_assert(_s == CUSOLVER_STATUS_SUCCESS && "cuSOLVER call failed"); \
+  } while (0)
+
+// ---- RAII: cusolverDnParams -------------------------------------------------
+
+struct CusolverParams {
+  cusolverDnParams_t p = nullptr;
+
+  CusolverParams() { EIGEN_CUSOLVER_CHECK(cusolverDnCreateParams(&p)); }
+
+  ~CusolverParams() {
+    if (p) (void)cusolverDnDestroyParams(p);  // destructor: can't propagate
+  }
+
+  // Move-only.
+  CusolverParams(CusolverParams&& o) noexcept : p(o.p) { o.p = nullptr; }
+  CusolverParams& operator=(CusolverParams&& o) noexcept {
+    if (this != &o) {
+      if (p) (void)cusolverDnDestroyParams(p);
+      p = o.p;
+      o.p = nullptr;
+    }
+    return *this;
+  }
+
+  CusolverParams(const CusolverParams&) = delete;
+  CusolverParams& operator=(const CusolverParams&) = delete;
+};
+
+// ---- Scalar → cudaDataType_t ------------------------------------------------
+// Alias for backward compatibility. The canonical trait is cuda_data_type<> in GpuSupport.h.
+template <typename Scalar>
+using cusolver_data_type = cuda_data_type<Scalar>;
+
+// ---- (UpLo, StorageOrder) → cublasFillMode_t --------------------------------
+// cuSOLVER always interprets the matrix as column-major. A row-major matrix A
+// appears as A^T to cuSOLVER, so the upper/lower triangle is swapped.
+
+template <int UpLo, int StorageOrder>
+struct cusolver_fill_mode;
+
+template <>
+struct cusolver_fill_mode<Lower, ColMajor> {
+  static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
+};
+template <>
+struct cusolver_fill_mode<Upper, ColMajor> {
+  static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_UPPER;
+};
+template <>
+struct cusolver_fill_mode<Lower, RowMajor> {
+  static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_UPPER;
+};
+template <>
+struct cusolver_fill_mode<Upper, RowMajor> {
+  static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
+};
+
+// ---- Type-specific cuSOLVER wrappers ----------------------------------------
+// cuSOLVER does not provide generic X variants for ormqr/unmqr. These overloaded
+// wrappers dispatch to the correct type-specific function.
+// For real types: ormqr (orthogonal Q). For complex types: unmqr (unitary Q).
+
+inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
+                                         int n, int k, const float* A, int lda, const float* tau, float* C, int ldc,
+                                         float* work, int lwork, int* info) {
+  return cusolverDnSormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
+}
+inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
+                                         int n, int k, const double* A, int lda, const double* tau, double* C, int ldc,
+                                         double* work, int lwork, int* info) {
+  return cusolverDnDormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
+}
+inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
+                                         int n, int k, const std::complex<float>* A, int lda,
+                                         const std::complex<float>* tau, std::complex<float>* C, int ldc,
+                                         std::complex<float>* work, int lwork, int* info) {
+  return cusolverDnCunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
+                          reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<cuComplex*>(C), ldc,
+                          reinterpret_cast<cuComplex*>(work), lwork, info);
+}
+inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
+                                         int n, int k, const std::complex<double>* A, int lda,
+                                         const std::complex<double>* tau, std::complex<double>* C, int ldc,
+                                         std::complex<double>* work, int lwork, int* info) {
+  return cusolverDnZunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
+                          reinterpret_cast<const cuDoubleComplex*>(tau), reinterpret_cast<cuDoubleComplex*>(C), ldc,
+                          reinterpret_cast<cuDoubleComplex*>(work), lwork, info);
+}
+
+// Buffer size wrappers for ormqr/unmqr.
+inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
+                                                    cublasOperation_t trans, int m, int n, int k, const float* A,
+                                                    int lda, const float* tau, const float* C, int ldc, int* lwork) {
+  return cusolverDnSormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
+}
+inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
+                                                    cublasOperation_t trans, int m, int n, int k, const double* A,
+                                                    int lda, const double* tau, const double* C, int ldc, int* lwork) {
+  return cusolverDnDormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
+}
+inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
+                                                    cublasOperation_t trans, int m, int n, int k,
+                                                    const std::complex<float>* A, int lda,
+                                                    const std::complex<float>* tau, const std::complex<float>* C,
+                                                    int ldc, int* lwork) {
+  return cusolverDnCunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
+                                     reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<const cuComplex*>(C),
+                                     ldc, lwork);
+}
+inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
+                                                    cublasOperation_t trans, int m, int n, int k,
+                                                    const std::complex<double>* A, int lda,
+                                                    const std::complex<double>* tau, const std::complex<double>* C,
+                                                    int ldc, int* lwork) {
+  return cusolverDnZunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
+                                     reinterpret_cast<const cuDoubleComplex*>(tau),
+                                     reinterpret_cast<const cuDoubleComplex*>(C), ldc, lwork);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_CUSOLVER_SUPPORT_H
--- a/Eigen/src/GPU/CuSparseSupport.h
+++ b/Eigen/src/GPU/CuSparseSupport.h
@@ -0,0 +1,34 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// cuSPARSE support utilities: error checking macro.
+
+#ifndef EIGEN_GPU_CUSPARSE_SUPPORT_H
+#define EIGEN_GPU_CUSPARSE_SUPPORT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSupport.h"
+#include <cusparse.h>
+
+namespace Eigen {
+namespace internal {
+
+#define EIGEN_CUSPARSE_CHECK(x)                                                 \
+  do {                                                                          \
+    cusparseStatus_t _s = (x);                                                  \
+    eigen_assert(_s == CUSPARSE_STATUS_SUCCESS && "cuSPARSE call failed: " #x); \
+    EIGEN_UNUSED_VARIABLE(_s);                                                  \
+  } while (0)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_CUSPARSE_SUPPORT_H
--- a/Eigen/src/GPU/DeviceBlasExpr.h
+++ b/Eigen/src/GPU/DeviceBlasExpr.h
@@ -0,0 +1,146 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// BLAS Level 3 expression types for DeviceMatrix (beyond GEMM):
+//   TrsmExpr           → cublasXtrsm   (triangular solve)
+//   SymmExpr           → cublasXsymm   (symmetric multiply, real)
+//                      → cublasXhemm   (Hermitian multiply, complex)
+//   SyrkExpr           → cublasXsyrk   (symmetric rank-k update, real)
+//                      → cublasXherk   (Hermitian rank-k update, complex)
+
+#ifndef EIGEN_GPU_DEVICE_BLAS_EXPR_H
+#define EIGEN_GPU_DEVICE_BLAS_EXPR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <typename Scalar_>
+class DeviceMatrix;
+
+// ---- DeviceTriangularView ---------------------------------------------------
+// d_A.triangularView<Lower>() → view with .solve(d_B)
+
+template <typename Scalar_, int UpLo_>
+class DeviceTriangularView {
+ public:
+  using Scalar = Scalar_;
+  enum { UpLo = UpLo_ };
+
+  explicit DeviceTriangularView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
+  const DeviceMatrix<Scalar>& matrix() const { return mat_; }
+
+  /** Build a TRSM solve expression. */
+  TrsmExpr<Scalar, UpLo_> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
+
+ private:
+  const DeviceMatrix<Scalar>& mat_;
+};
+
+// ---- TrsmExpr: triangularView<UpLo>().solve(B) → cublasXtrsm ---------------
+
+template <typename Scalar_, int UpLo_>
+class TrsmExpr {
+ public:
+  using Scalar = Scalar_;
+  enum { UpLo = UpLo_ };
+
+  TrsmExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
+  const DeviceMatrix<Scalar>& matrix() const { return A_; }
+  const DeviceMatrix<Scalar>& rhs() const { return B_; }
+
+ private:
+  const DeviceMatrix<Scalar>& A_;
+  const DeviceMatrix<Scalar>& B_;
+};
+
+// ---- DeviceSelfAdjointView --------------------------------------------------
+// d_A.selfadjointView<Lower>() → view that can multiply: view * d_B
+
+template <typename Scalar_, int UpLo_>
+class DeviceSelfAdjointView {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  enum { UpLo = UpLo_ };
+
+  explicit DeviceSelfAdjointView(DeviceMatrix<Scalar>& m) : mat_(m) {}
+  const DeviceMatrix<Scalar>& matrix() const { return mat_; }
+  DeviceMatrix<Scalar>& matrix() { return mat_; }
+
+  /** Rank-k update: C.selfadjointView<Lower>().rankUpdate(A, alpha)
+   * computes C = alpha * A * A^H + C (lower triangle only).
+   * Maps to cublasXsyrk (real) or cublasXherk (complex). */
+  void rankUpdate(const DeviceMatrix<Scalar>& A, RealScalar alpha = RealScalar(1));
+
+ private:
+  DeviceMatrix<Scalar>& mat_;
+};
+
+// Const variant for multiplication only (no rankUpdate).
+template <typename Scalar_, int UpLo_>
+class ConstDeviceSelfAdjointView {
+ public:
+  using Scalar = Scalar_;
+  enum { UpLo = UpLo_ };
+
+  explicit ConstDeviceSelfAdjointView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
+  const DeviceMatrix<Scalar>& matrix() const { return mat_; }
+
+ private:
+  const DeviceMatrix<Scalar>& mat_;
+};
+
+// ---- SymmExpr: selfadjointView<UpLo>() * B → cublasXsymm/Xhemm ------------
+
+template <typename Scalar_, int UpLo_>
+class SymmExpr {
+ public:
+  using Scalar = Scalar_;
+  enum { UpLo = UpLo_ };
+
+  SymmExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
+  const DeviceMatrix<Scalar>& matrix() const { return A_; }
+  const DeviceMatrix<Scalar>& rhs() const { return B_; }
+
+ private:
+  const DeviceMatrix<Scalar>& A_;
+  const DeviceMatrix<Scalar>& B_;
+};
+
+// operator*: DeviceSelfAdjointView * DeviceMatrix → SymmExpr (mutable and const variants)
+template <typename S, int UpLo>
+SymmExpr<S, UpLo> operator*(const DeviceSelfAdjointView<S, UpLo>& a, const DeviceMatrix<S>& b) {
+  return {a.matrix(), b};
+}
+template <typename S, int UpLo>
+SymmExpr<S, UpLo> operator*(const ConstDeviceSelfAdjointView<S, UpLo>& a, const DeviceMatrix<S>& b) {
+  return {a.matrix(), b};
+}
+
+// ---- SyrkExpr: rankUpdate(A) → cublasXsyrk/Xherk ---------------------------
+// C.rankUpdate(A) computes C += A * A^H (or A^H * A depending on convention).
+
+template <typename Scalar_, int UpLo_>
+class SyrkExpr {
+ public:
+  using Scalar = Scalar_;
+  enum { UpLo = UpLo_ };
+
+  SyrkExpr(const DeviceMatrix<Scalar>& A) : A_(A) {}
+  const DeviceMatrix<Scalar>& matrix() const { return A_; }
+
+ private:
+  const DeviceMatrix<Scalar>& A_;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_DEVICE_BLAS_EXPR_H
--- a/Eigen/src/GPU/DeviceDispatch.h
+++ b/Eigen/src/GPU/DeviceDispatch.h
@@ -0,0 +1,509 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Dispatch functions that map DeviceMatrix expressions to NVIDIA library calls.
+//
+// dispatch_gemm()  — GemmExpr → cublasXgemm
+//
+// Each function documents the exact library call and parameters.
+
+#ifndef EIGEN_GPU_DEVICE_DISPATCH_H
+#define EIGEN_GPU_DEVICE_DISPATCH_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./DeviceExpr.h"
+#include "./DeviceBlasExpr.h"
+#include "./DeviceSolverExpr.h"
+#include "./GpuContext.h"
+#include "./CuSolverSupport.h"
+
+namespace Eigen {
+namespace internal {
+
+// ---- GEMM dispatch ----------------------------------------------------------
+// GemmExpr<Lhs, Rhs> → cublasGemmEx(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
+//
+// The generic API cublasGemmEx handles all scalar types (float, double,
+// complex<float>, complex<double>) via cudaDataType_t.
+
+template <typename Lhs, typename Rhs>
+void dispatch_gemm(
+    GpuContext& ctx, DeviceMatrix<typename device_expr_traits<Lhs>::scalar_type>& dst, const GemmExpr<Lhs, Rhs>& expr,
+    typename device_expr_traits<Lhs>::scalar_type beta_val,
+    typename device_expr_traits<Lhs>::scalar_type alpha_scale = typename device_expr_traits<Lhs>::scalar_type(1)) {
+  using Scalar = typename device_expr_traits<Lhs>::scalar_type;
+  using traits_lhs = device_expr_traits<Lhs>;
+  using traits_rhs = device_expr_traits<Rhs>;
+
+  const DeviceMatrix<Scalar>& A = traits_lhs::matrix(expr.lhs());
+  const DeviceMatrix<Scalar>& B = traits_rhs::matrix(expr.rhs());
+
+  constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op);
+  constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op);
+
+  // GEMM dimensions: C(m,n) = op(A)(m,k) * op(B)(k,n)
+  // op(A) has dimensions (A.rows, A.cols) if NoTrans, (A.cols, A.rows) if Trans/ConjTrans.
+  const int64_t m = (traits_lhs::op == GpuOp::NoTrans) ? A.rows() : A.cols();
+  const int64_t k = (traits_lhs::op == GpuOp::NoTrans) ? A.cols() : A.rows();
+  const int64_t n = (traits_rhs::op == GpuOp::NoTrans) ? B.cols() : B.rows();
+  const int64_t rhs_k = (traits_rhs::op == GpuOp::NoTrans) ? B.rows() : B.cols();
+
+  eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch");
+
+  const int64_t lda = A.rows();
+  const int64_t ldb = B.rows();
+
+  // Serialize all accesses to the destination buffer on this stream.
+  if (!dst.empty()) {
+    dst.waitReady(ctx.stream());
+  }
+
+  // Allocate or resize destination.
+  const bool resized = dst.empty() || dst.rows() != m || dst.cols() != n;
+  if (resized) {
+    dst.resize(m, n);
+  }
+  const int64_t ldc = dst.rows();
+
+  // cuBLAS requires alpha/beta as float for half/bfloat16 inputs.
+  using GemmScalar = typename cuda_gemm_scalar<Scalar>::type;
+  GemmScalar alpha_gval =
+      static_cast<GemmScalar>(alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs()));
+  GemmScalar beta_gval = static_cast<GemmScalar>(beta_val);
+
+  // Wait for operands to be ready on this stream.
+  A.waitReady(ctx.stream());
+  B.waitReady(ctx.stream());
+
+  // If there is no existing valid destination to accumulate into, treat it as
+  // zero rather than reading uninitialized memory.
+  if (resized && beta_gval != GemmScalar(0) && dst.sizeInBytes() > 0) {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
+  }
+
+  constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
+  constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
+
+  EIGEN_CUBLAS_CHECK(cublasGemmEx(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n),
+                                  static_cast<int>(k), &alpha_gval, A.data(), dtype, static_cast<int>(lda), B.data(),
+                                  dtype, static_cast<int>(ldb), &beta_gval, dst.data(), dtype, static_cast<int>(ldc),
+                                  compute, cuda_gemm_algo()));
+
+  dst.recordReady(ctx.stream());
+}
+
+// ---- LLT solve dispatch -----------------------------------------------------
+// LltSolveExpr → cusolverDnXpotrf (factorize) + cusolverDnXpotrs (solve).
+// No caching — factor and workspace are temporary. Syncs to check info.
+
+template <typename Scalar, int UpLo>
+void dispatch_llt_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LltSolveExpr<Scalar, UpLo>& expr) {
+  const DeviceMatrix<Scalar>& A = expr.matrix();
+  const DeviceMatrix<Scalar>& B = expr.rhs();
+
+  eigen_assert(A.rows() == A.cols() && "LLT requires a square matrix");
+  eigen_assert(B.rows() == A.rows() && "LLT solve: RHS rows must match matrix size");
+
+  const Index n = A.rows();
+  const int64_t nrhs = static_cast<int64_t>(B.cols());
+
+  // Zero-size fast paths: no work, just resize dst.
+  // Wait on dst before resize to avoid freeing memory another stream is using.
+  if (n == 0 || nrhs == 0) {
+    if (!dst.empty()) dst.waitReady(ctx.stream());
+    dst.resize(n == 0 ? 0 : n, B.cols());
+    return;
+  }
+
+  A.waitReady(ctx.stream());
+  B.waitReady(ctx.stream());
+  if (!dst.empty()) dst.waitReady(ctx.stream());
+
+  constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
+  constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo, ColMajor>::value;
+  const int64_t lda = static_cast<int64_t>(A.rows());
+  const int64_t ldb = static_cast<int64_t>(B.rows());
+
+  const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
+  const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+
+  // D2D copy A → factor buffer (potrf is in-place).
+  DeviceBuffer d_factor(mat_bytes);
+  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_factor.ptr, A.data(), mat_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
+
+  // Query workspace and factorize.
+  CusolverParams params;
+  DeviceBuffer d_factorize_info(sizeof(int));
+  size_t dev_ws = 0, host_ws = 0;
+  EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), dtype,
+                                                   d_factor.ptr, lda, dtype, &dev_ws, &host_ws));
+
+  DeviceBuffer d_workspace(dev_ws);
+  std::vector<char> h_workspace(host_ws);
+
+  EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(
+      ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), dtype, d_factor.ptr, lda, dtype, d_workspace.ptr,
+      dev_ws, host_ws > 0 ? h_workspace.data() : nullptr, host_ws, static_cast<int*>(d_factorize_info.ptr)));
+
+  // Check factorization info before proceeding to solve.
+  int factorize_info = 0;
+  EIGEN_CUDA_RUNTIME_CHECK(
+      cudaMemcpyAsync(&factorize_info, d_factorize_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
+  eigen_assert(factorize_info == 0 && "cuSOLVER LLT factorization failed (matrix not positive definite)");
+
+  // D2D copy B → dst (potrs is in-place on the RHS).
+  dst.resize(n, B.cols());
+  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
+
+  // Solve.
+  DeviceBuffer d_solve_info(sizeof(int));
+  EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), nrhs, dtype,
+                                        d_factor.ptr, lda, dtype, dst.data(), static_cast<int64_t>(dst.rows()),
+                                        static_cast<int*>(d_solve_info.ptr)));
+
+  // Sync to ensure workspace locals can be freed safely.
+  int solve_info = 0;
+  EIGEN_CUDA_RUNTIME_CHECK(
+      cudaMemcpyAsync(&solve_info, d_solve_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
+  eigen_assert(solve_info == 0 && "cuSOLVER LLT solve failed");
+
+  dst.recordReady(ctx.stream());
+}
+
+// ---- LU solve dispatch ------------------------------------------------------
+// LuSolveExpr → cusolverDnXgetrf (factorize) + cusolverDnXgetrs (solve).
+
+template <typename Scalar>
+void dispatch_lu_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LuSolveExpr<Scalar>& expr) {
+  const DeviceMatrix<Scalar>& A = expr.matrix();
+  const DeviceMatrix<Scalar>& B = expr.rhs();
+
+  eigen_assert(A.rows() == A.cols() && "LU requires a square matrix");
+  eigen_assert(B.rows() == A.rows() && "LU solve: RHS rows must match matrix size");
+
+  const Index n = A.rows();
+  const int64_t nrhs = static_cast<int64_t>(B.cols());
+
+  if (n == 0 || nrhs == 0) {
+    if (!dst.empty()) dst.waitReady(ctx.stream());
+    dst.resize(n == 0 ? 0 : n, B.cols());
+    return;
+  }
+
+  A.waitReady(ctx.stream());
+  B.waitReady(ctx.stream());
+  if (!dst.empty()) dst.waitReady(ctx.stream());
+
+  constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
+  const int64_t lda = static_cast<int64_t>(A.rows());
+  const int64_t ldb = static_cast<int64_t>(B.rows());
+
+  const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
+  const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+  const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t);
+
+  // D2D copy A → LU buffer (getrf is in-place).
+  DeviceBuffer d_lu(mat_bytes);
+  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu.ptr, A.data(), mat_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
+
+  DeviceBuffer d_ipiv(ipiv_bytes);
+
+  // Query workspace and factorize.
+  CusolverParams params;
+  DeviceBuffer d_factorize_info(sizeof(int));
+  size_t dev_ws = 0, host_ws = 0;
+  EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(ctx.cusolverHandle(), params.p, static_cast<int64_t>(n),
+                                                   static_cast<int64_t>(n), dtype, d_lu.ptr, lda, dtype, &dev_ws,
+                                                   &host_ws));
+
+  DeviceBuffer d_workspace(dev_ws);
+  std::vector<char> h_workspace(host_ws);
+
+  EIGEN_CUSOLVER_CHECK(
+      cusolverDnXgetrf(ctx.cusolverHandle(), params.p, static_cast<int64_t>(n), static_cast<int64_t>(n), dtype,
+                       d_lu.ptr, lda, static_cast<int64_t*>(d_ipiv.ptr), dtype, d_workspace.ptr, dev_ws,
+                       host_ws > 0 ? h_workspace.data() : nullptr, host_ws, static_cast<int*>(d_factorize_info.ptr)));
+
+  // Check factorization info before proceeding to solve.
+  int factorize_info = 0;
+  EIGEN_CUDA_RUNTIME_CHECK(
+      cudaMemcpyAsync(&factorize_info, d_factorize_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
+  eigen_assert(factorize_info == 0 && "cuSOLVER LU factorization failed (singular matrix)");
+
+  // D2D copy B → dst (getrs is in-place on the RHS).
+  dst.resize(n, B.cols());
+  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
+
+  // Solve (NoTranspose).
+  DeviceBuffer d_solve_info(sizeof(int));
+  EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, static_cast<int64_t>(n), nrhs,
+                                        dtype, d_lu.ptr, lda, static_cast<const int64_t*>(d_ipiv.ptr), dtype,
+                                        dst.data(), static_cast<int64_t>(dst.rows()),
+                                        static_cast<int*>(d_solve_info.ptr)));
+
+  // Sync to ensure workspace locals can be freed safely.
+  int solve_info = 0;
+  EIGEN_CUDA_RUNTIME_CHECK(
+      cudaMemcpyAsync(&solve_info, d_solve_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
+  eigen_assert(solve_info == 0 && "cuSOLVER LU solve failed");
+
+  dst.recordReady(ctx.stream());
+}
+
+// ---- TRSM dispatch ----------------------------------------------------------
+// TrsmExpr → cublasXtrsm: solve op(A) * X = B where A is triangular.
+// Side=Left, Diag=NonUnit. A is square, B is n×nrhs.
+
+template <typename Scalar, int UpLo>
+void dispatch_trsm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const TrsmExpr<Scalar, UpLo>& expr) {
+  const DeviceMatrix<Scalar>& A = expr.matrix();
+  const DeviceMatrix<Scalar>& B = expr.rhs();
+
+  eigen_assert(A.rows() == A.cols() && "TRSM requires a square triangular matrix");
+  eigen_assert(B.rows() == A.rows() && "TRSM: RHS rows must match matrix size");
+
+  const int n = static_cast<int>(A.rows());
+  const int nrhs = static_cast<int>(B.cols());
+
+  if (n == 0 || nrhs == 0) {
+    if (!dst.empty()) dst.waitReady(ctx.stream());
+    dst.resize(n == 0 ? 0 : n, B.cols());
+    return;
+  }
+
+  A.waitReady(ctx.stream());
+  B.waitReady(ctx.stream());
+  if (!dst.empty()) dst.waitReady(ctx.stream());
+
+  // D2D copy B → dst (trsm is in-place on the RHS).
+  dst.resize(n, B.cols());
+  const size_t rhs_bytes = static_cast<size_t>(dst.rows()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
+
+  constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
+  Scalar alpha(1);
+
+  EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs,
+                                 &alpha, A.data(), static_cast<int>(A.rows()), dst.data(),
+                                 static_cast<int>(dst.rows())));
+
+  dst.recordReady(ctx.stream());
+}
+
+// ---- SYMM/HEMM dispatch -----------------------------------------------------
+// SymmExpr → cublasXsymm (real) or cublasXhemm (complex).
+// C = A * B where A is symmetric/Hermitian. Side=Left.
+
+template <typename Scalar, int UpLo>
+void dispatch_symm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SymmExpr<Scalar, UpLo>& expr) {
+  const DeviceMatrix<Scalar>& A = expr.matrix();
+  const DeviceMatrix<Scalar>& B = expr.rhs();
+
+  eigen_assert(A.rows() == A.cols() && "SYMM requires a square matrix");
+  eigen_assert(B.rows() == A.rows() && "SYMM: RHS rows must match matrix size");
+
+  const int m = static_cast<int>(A.rows());
+  const int n = static_cast<int>(B.cols());
+
+  if (m == 0 || n == 0) {
+    if (!dst.empty()) dst.waitReady(ctx.stream());
+    dst.resize(m == 0 ? 0 : m, B.cols());
+    return;
+  }
+
+  A.waitReady(ctx.stream());
+  B.waitReady(ctx.stream());
+  if (!dst.empty()) dst.waitReady(ctx.stream());
+
+  dst.resize(m, n);
+
+  constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
+  Scalar alpha(1), beta(0);
+
+  EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &alpha, A.data(),
+                                 static_cast<int>(A.rows()), B.data(), static_cast<int>(B.rows()), &beta, dst.data(),
+                                 static_cast<int>(dst.rows())));
+
+  dst.recordReady(ctx.stream());
+}
+
+// ---- SYRK/HERK dispatch -----------------------------------------------------
+// SyrkExpr → cublasXsyrk (real) or cublasXherk (complex).
+// C = alpha * A * A^H + beta * C. UpLo specifies which triangle of C is stored.
+
+template <typename Scalar, int UpLo>
+void dispatch_syrk(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SyrkExpr<Scalar, UpLo>& expr,
+                   typename NumTraits<Scalar>::Real alpha_val, typename NumTraits<Scalar>::Real beta_val) {
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  const DeviceMatrix<Scalar>& A = expr.matrix();
+
+  const int n = static_cast<int>(A.rows());
+  const int k = static_cast<int>(A.cols());
+
+  if (n == 0) {
+    if (!dst.empty()) dst.waitReady(ctx.stream());
+    dst.resize(0, 0);
+    return;
+  }
+
+  A.waitReady(ctx.stream());
+  if (!dst.empty()) dst.waitReady(ctx.stream());
+
+  if (dst.empty() || dst.rows() != n || dst.cols() != n) {
+    dst.resize(n, n);
+    if (beta_val != RealScalar(0)) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
+    }
+  }
+
+  constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
+
+  EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(),
+                                 static_cast<int>(A.rows()), &beta_val, dst.data(), static_cast<int>(dst.rows())));
+
+  dst.recordReady(ctx.stream());
+}
+
+}  // namespace internal
+
+// ---- DeviceAssignment: d_C.device(ctx) = expr ------------------------------
+// Returned by DeviceMatrix::device(ctx). Dispatches expressions to library calls.
+
+template <typename Scalar_>
+class DeviceAssignment {
+ public:
+  using Scalar = Scalar_;
+
+  DeviceAssignment(DeviceMatrix<Scalar>& dst, GpuContext& ctx) : dst_(dst), ctx_(ctx) {}
+
+  // operator= dispatches GEMM with beta=0 (overwrite).
+  template <typename Lhs, typename Rhs>
+  DeviceMatrix<Scalar>& operator=(const GemmExpr<Lhs, Rhs>& expr) {
+    internal::dispatch_gemm(ctx_, dst_, expr, Scalar(0));
+    return dst_;
+  }
+
+  // operator+= dispatches GEMM with beta=1 (accumulate).
+  template <typename Lhs, typename Rhs>
+  DeviceMatrix<Scalar>& operator+=(const GemmExpr<Lhs, Rhs>& expr) {
+    internal::dispatch_gemm(ctx_, dst_, expr, Scalar(1));
+    return dst_;
+  }
+
+  // operator-= dispatches GEMM with negated alpha, beta=1: C = C - alpha*op(A)*op(B).
+  template <typename Lhs, typename Rhs>
+  DeviceMatrix<Scalar>& operator-=(const GemmExpr<Lhs, Rhs>& expr) {
+    internal::dispatch_gemm(ctx_, dst_, expr, Scalar(1), Scalar(-1));
+    return dst_;
+  }
+
+  // operator= dispatches LLT solve (potrf + potrs).
+  template <int UpLo>
+  DeviceMatrix<Scalar>& operator=(const LltSolveExpr<Scalar, UpLo>& expr) {
+    internal::dispatch_llt_solve(ctx_, dst_, expr);
+    return dst_;
+  }
+
+  // operator= dispatches LU solve (getrf + getrs).
+  DeviceMatrix<Scalar>& operator=(const LuSolveExpr<Scalar>& expr) {
+    internal::dispatch_lu_solve(ctx_, dst_, expr);
+    return dst_;
+  }
+
+  // operator= dispatches TRSM (triangular solve).
+  template <int UpLo>
+  DeviceMatrix<Scalar>& operator=(const TrsmExpr<Scalar, UpLo>& expr) {
+    internal::dispatch_trsm(ctx_, dst_, expr);
+    return dst_;
+  }
+
+  // operator= dispatches SYMM/HEMM (symmetric/Hermitian multiply).
+  template <int UpLo>
+  DeviceMatrix<Scalar>& operator=(const SymmExpr<Scalar, UpLo>& expr) {
+    internal::dispatch_symm(ctx_, dst_, expr);
+    return dst_;
+  }
+
+  // Catch-all: static_assert for unsupported expressions.
+  template <typename Expr>
+  DeviceMatrix<Scalar>& operator=(const Expr&) {
+    static_assert(sizeof(Expr) == 0,
+                  "DeviceMatrix expression not supported: no cuBLAS/cuSOLVER mapping. "
+                  "Supported: GEMM (A*B), TRSM (.triangularView().solve()), "
+                  "SYMM (.selfadjointView()*B), LLT (.llt().solve()), LU (.lu().solve()).");
+    return dst_;
+  }
+
+ private:
+  DeviceMatrix<Scalar>& dst_;
+  GpuContext& ctx_;
+};
+
+// ---- Out-of-line DeviceMatrix expression operator= definitions -------------
+// These are declared in DeviceMatrix.h but defined here because they need
+// GpuContext::threadLocal() which requires the full GpuContext definition.
+
+template <typename Scalar_>
+template <typename Lhs, typename Rhs>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const GemmExpr<Lhs, Rhs>& expr) {
+  device(GpuContext::threadLocal()) = expr;
+  return *this;
+}
+
+template <typename Scalar_>
+template <typename Lhs, typename Rhs>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const GemmExpr<Lhs, Rhs>& expr) {
+  device(GpuContext::threadLocal()) += expr;
+  return *this;
+}
+
+template <typename Scalar_>
+template <int UpLo>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const LltSolveExpr<Scalar_, UpLo>& expr) {
+  device(GpuContext::threadLocal()) = expr;
+  return *this;
+}
+
+template <typename Scalar_>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const LuSolveExpr<Scalar_>& expr) {
+  device(GpuContext::threadLocal()) = expr;
+  return *this;
+}
+
+template <typename Scalar_>
+template <int UpLo>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const TrsmExpr<Scalar_, UpLo>& expr) {
+  device(GpuContext::threadLocal()) = expr;
+  return *this;
+}
+
+template <typename Scalar_>
+template <int UpLo>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const SymmExpr<Scalar_, UpLo>& expr) {
+  device(GpuContext::threadLocal()) = expr;
+  return *this;
+}
+
+// DeviceSelfAdjointView::rankUpdate — defined here because it needs GpuContext.
+template <typename Scalar_, int UpLo_>
+void DeviceSelfAdjointView<Scalar_, UpLo_>::rankUpdate(const DeviceMatrix<Scalar_>& A, RealScalar alpha) {
+  SyrkExpr<Scalar_, UpLo_> expr(A);
+  RealScalar beta = matrix().empty() ? RealScalar(0) : RealScalar(1);
+  internal::dispatch_syrk(GpuContext::threadLocal(), matrix(), expr, alpha, beta);
+}
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_DEVICE_DISPATCH_H
--- a/Eigen/src/GPU/DeviceExpr.h
+++ b/Eigen/src/GPU/DeviceExpr.h
@@ -0,0 +1,224 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Lightweight expression types for DeviceMatrix operations.
+//
+// These are NOT Eigen expression templates. Each type maps 1:1 to a single
+// NVIDIA library call (cuBLAS or cuSOLVER). There is no coefficient-level
+// evaluation, no lazy fusion, no packet operations.
+//
+// Expression types:
+//   DeviceAdjointView<S>  — d_A.adjoint()  → marks ConjTrans for GEMM
+//   DeviceTransposeView<S> — d_A.transpose() → marks Trans for GEMM
+//   DeviceScaled<Expr>    — alpha * expr    → carries scalar factor
+//   GemmExpr<Lhs, Rhs>   — lhs * rhs       → dispatches to cublasXgemm
+
+#ifndef EIGEN_GPU_DEVICE_EXPR_H
+#define EIGEN_GPU_DEVICE_EXPR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuBlasSupport.h"
+
+namespace Eigen {
+
+// Forward declaration.
+template <typename Scalar_>
+class DeviceMatrix;
+
+namespace internal {
+
+// ---- Traits: extract operation info from expression types -------------------
+
+// Default: a DeviceMatrix is NoTrans.
+template <typename T>
+struct device_expr_traits {
+  static constexpr bool is_device_expr = false;
+};
+
+template <typename Scalar>
+struct device_expr_traits<DeviceMatrix<Scalar>> {
+  using scalar_type = Scalar;
+  static constexpr GpuOp op = GpuOp::NoTrans;
+  static constexpr bool is_device_expr = true;
+  static const DeviceMatrix<Scalar>& matrix(const DeviceMatrix<Scalar>& x) { return x; }
+  static Scalar alpha(const DeviceMatrix<Scalar>&) { return Scalar(1); }
+};
+
+}  // namespace internal
+
+// ---- DeviceAdjointView: marks ConjTrans ------------------------------------
+// Returned by DeviceMatrix::adjoint(). Maps to cublasXgemm transA/B = C.
+
+template <typename Scalar_>
+class DeviceAdjointView {
+ public:
+  using Scalar = Scalar_;
+  explicit DeviceAdjointView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
+  const DeviceMatrix<Scalar>& matrix() const { return mat_; }
+
+ private:
+  const DeviceMatrix<Scalar>& mat_;
+};
+
+namespace internal {
+template <typename Scalar>
+struct device_expr_traits<DeviceAdjointView<Scalar>> {
+  using scalar_type = Scalar;
+  static constexpr GpuOp op = GpuOp::ConjTrans;
+  static constexpr bool is_device_expr = true;
+  static const DeviceMatrix<Scalar>& matrix(const DeviceAdjointView<Scalar>& x) { return x.matrix(); }
+  static Scalar alpha(const DeviceAdjointView<Scalar>&) { return Scalar(1); }
+};
+}  // namespace internal
+
+// ---- DeviceTransposeView: marks Trans --------------------------------------
+// Returned by DeviceMatrix::transpose(). Maps to cublasXgemm transA/B = T.
+
+template <typename Scalar_>
+class DeviceTransposeView {
+ public:
+  using Scalar = Scalar_;
+  explicit DeviceTransposeView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
+  const DeviceMatrix<Scalar>& matrix() const { return mat_; }
+
+ private:
+  const DeviceMatrix<Scalar>& mat_;
+};
+
+namespace internal {
+template <typename Scalar>
+struct device_expr_traits<DeviceTransposeView<Scalar>> {
+  using scalar_type = Scalar;
+  static constexpr GpuOp op = GpuOp::Trans;
+  static constexpr bool is_device_expr = true;
+  static const DeviceMatrix<Scalar>& matrix(const DeviceTransposeView<Scalar>& x) { return x.matrix(); }
+  static Scalar alpha(const DeviceTransposeView<Scalar>&) { return Scalar(1); }
+};
+}  // namespace internal
+
+// ---- DeviceScaled: alpha * expr --------------------------------------------
+// Returned by operator*(Scalar, DeviceMatrix/View). Carries the scalar factor.
+
+template <typename Inner>
+class DeviceScaled {
+ public:
+  using Scalar = typename internal::device_expr_traits<Inner>::scalar_type;
+  DeviceScaled(Scalar alpha, const Inner& inner) : alpha_(alpha), inner_(inner) {}
+  Scalar scalar() const { return alpha_; }
+  const Inner& inner() const { return inner_; }
+
+ private:
+  Scalar alpha_;
+  const Inner& inner_;
+};
+
+namespace internal {
+template <typename Inner>
+struct device_expr_traits<DeviceScaled<Inner>> {
+  using scalar_type = typename device_expr_traits<Inner>::scalar_type;
+  static constexpr GpuOp op = device_expr_traits<Inner>::op;
+  static constexpr bool is_device_expr = true;
+  static const DeviceMatrix<scalar_type>& matrix(const DeviceScaled<Inner>& x) {
+    return device_expr_traits<Inner>::matrix(x.inner());
+  }
+  static scalar_type alpha(const DeviceScaled<Inner>& x) {
+    return x.scalar() * device_expr_traits<Inner>::alpha(x.inner());
+  }
+};
+}  // namespace internal
+
+// ---- GemmExpr: lhs * rhs → cublasXgemm ------------------------------------
+// Returned by operator*(lhs_expr, rhs_expr). Dispatches to cuBLAS GEMM.
+
+template <typename Lhs, typename Rhs>
+class GemmExpr {
+ public:
+  using Scalar = typename internal::device_expr_traits<Lhs>::scalar_type;
+  static_assert(std::is_same<Scalar, typename internal::device_expr_traits<Rhs>::scalar_type>::value,
+                "DeviceMatrix GEMM: LHS and RHS must have the same scalar type");
+
+  GemmExpr(const Lhs& lhs, const Rhs& rhs) : lhs_(lhs), rhs_(rhs) {}
+  const Lhs& lhs() const { return lhs_; }
+  const Rhs& rhs() const { return rhs_; }
+
+ private:
+  // Stored by reference. Expression objects must not outlive their operands.
+  // This is safe for the one-liner pattern (d_C = d_A * d_B) since all
+  // temporaries live until the semicolon.
+  const Lhs& lhs_;
+  const Rhs& rhs_;
+};
+
+// ---- Free operator* overloads that produce GemmExpr ------------------------
+// These cover: DM*DM, Adj*DM, DM*Adj, Trans*DM, DM*Trans, Scaled*DM, etc.
+
+// DeviceMatrix * DeviceMatrix
+template <typename S>
+GemmExpr<DeviceMatrix<S>, DeviceMatrix<S>> operator*(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
+  return {a, b};
+}
+
+// AdjointView * DeviceMatrix
+template <typename S>
+GemmExpr<DeviceAdjointView<S>, DeviceMatrix<S>> operator*(const DeviceAdjointView<S>& a, const DeviceMatrix<S>& b) {
+  return {a, b};
+}
+
+// DeviceMatrix * AdjointView
+template <typename S>
+GemmExpr<DeviceMatrix<S>, DeviceAdjointView<S>> operator*(const DeviceMatrix<S>& a, const DeviceAdjointView<S>& b) {
+  return {a, b};
+}
+
+// TransposeView * DeviceMatrix
+template <typename S>
+GemmExpr<DeviceTransposeView<S>, DeviceMatrix<S>> operator*(const DeviceTransposeView<S>& a, const DeviceMatrix<S>& b) {
+  return {a, b};
+}
+
+// DeviceMatrix * TransposeView
+template <typename S>
+GemmExpr<DeviceMatrix<S>, DeviceTransposeView<S>> operator*(const DeviceMatrix<S>& a, const DeviceTransposeView<S>& b) {
+  return {a, b};
+}
+
+// Scaled * DeviceMatrix
+template <typename Inner, typename S>
+GemmExpr<DeviceScaled<Inner>, DeviceMatrix<S>> operator*(const DeviceScaled<Inner>& a, const DeviceMatrix<S>& b) {
+  return {a, b};
+}
+
+// DeviceMatrix * Scaled
+template <typename S, typename Inner>
+GemmExpr<DeviceMatrix<S>, DeviceScaled<Inner>> operator*(const DeviceMatrix<S>& a, const DeviceScaled<Inner>& b) {
+  return {a, b};
+}
+
+// ---- Scalar * DeviceMatrix / View → DeviceScaled ---------------------------
+
+template <typename S>
+DeviceScaled<DeviceMatrix<S>> operator*(S alpha, const DeviceMatrix<S>& m) {
+  return {alpha, m};
+}
+
+template <typename S>
+DeviceScaled<DeviceAdjointView<S>> operator*(S alpha, const DeviceAdjointView<S>& m) {
+  return {alpha, m};
+}
+
+template <typename S>
+DeviceScaled<DeviceTransposeView<S>> operator*(S alpha, const DeviceTransposeView<S>& m) {
+  return {alpha, m};
+}
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_DEVICE_EXPR_H
--- a/Eigen/src/GPU/DeviceMatrix.h
+++ b/Eigen/src/GPU/DeviceMatrix.h
@@ -0,0 +1,503 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Typed RAII wrapper for a dense matrix in GPU device memory.
+//
+// DeviceMatrix<Scalar> holds a column-major matrix on the GPU with tracked
+// dimensions. Always dense (leading dimension = rows). It can be passed to GPU solvers
+// (GpuLLT, GpuLU, future cuBLAS/cuDSS) without host round-trips.
+//
+// Cross-stream safety is automatic: an internal CUDA event tracks when the
+// last write completed. Consumers on a different stream wait on that event
+// before reading.
+//
+// Usage:
+//   auto d_A = DeviceMatrix<double>::fromHost(A);   // upload (sync)
+//   GpuLLT<double> llt;
+//   llt.compute(d_A);                                // factor on device
+//   auto d_X = llt.solve(d_B);                       // async, no sync
+//   MatrixXd X = d_X.toHost();                       // download + block
+//
+// Async variants:
+//   auto d_A = DeviceMatrix<double>::fromHostAsync(A.data(), n, n, stream);
+//   auto transfer = d_X.toHostAsync(stream);         // enqueue D2H
+//   // ... overlap with other work ...
+//   MatrixXd X = transfer.get();                     // block + retrieve
+
+#ifndef EIGEN_GPU_DEVICE_MATRIX_H
+#define EIGEN_GPU_DEVICE_MATRIX_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSupport.h"
+
+namespace Eigen {
+
+// Forward declarations.
+template <typename, int>
+class GpuLLT;
+template <typename>
+class GpuLU;
+template <typename>
+class DeviceAdjointView;
+template <typename>
+class DeviceTransposeView;
+template <typename>
+class DeviceAssignment;
+template <typename, typename>
+class GemmExpr;
+template <typename, int>
+class LltSolveExpr;
+template <typename>
+class LuSolveExpr;
+template <typename, int>
+class DeviceLLTView;
+template <typename>
+class DeviceLUView;
+template <typename, int>
+class DeviceTriangularView;
+template <typename, int>
+class DeviceSelfAdjointView;
+template <typename, int>
+class ConstDeviceSelfAdjointView;
+template <typename, int>
+class TrsmExpr;
+template <typename, int>
+class SymmExpr;
+template <typename, int>
+class SyrkExpr;
+class GpuContext;
+
+// --------------------------------------------------------------------------
+// HostTransfer — future-like wrapper for an async device-to-host transfer.
+// --------------------------------------------------------------------------
+
+/** \ingroup GPU_Module
+ * \class HostTransfer
+ * \brief Future for an asynchronous device-to-host matrix transfer.
+ *
+ * Returned by DeviceMatrix::toHostAsync(). The transfer runs asynchronously
+ * on the given CUDA stream. Call get() to block until complete and retrieve
+ * the host matrix, or ready() to poll without blocking.
+ */
+template <typename Scalar_>
+class HostTransfer {
+ public:
+  using Scalar = Scalar_;
+  using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+
+  /** Block until the transfer completes and return the host matrix.
+   * Idempotent: subsequent calls return the same matrix without re-syncing. */
+  PlainMatrix& get() {
+    if (!synced_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaEventSynchronize(event_));
+      synced_ = true;
+    }
+    return host_buf_;
+  }
+
+  /** Non-blocking check: has the transfer completed? */
+  bool ready() const {
+    if (synced_) return true;
+    cudaError_t err = cudaEventQuery(event_);
+    if (err == cudaSuccess) return true;
+    eigen_assert(err == cudaErrorNotReady && "cudaEventQuery failed");
+    return false;
+  }
+
+  ~HostTransfer() {
+    if (event_) (void)cudaEventDestroy(event_);
+  }
+
+  HostTransfer(HostTransfer&& o) noexcept : host_buf_(std::move(o.host_buf_)), event_(o.event_), synced_(o.synced_) {
+    o.event_ = nullptr;
+    o.synced_ = true;
+  }
+
+  HostTransfer& operator=(HostTransfer&& o) noexcept {
+    if (this != &o) {
+      if (event_) (void)cudaEventDestroy(event_);
+      host_buf_ = std::move(o.host_buf_);
+      event_ = o.event_;
+      synced_ = o.synced_;
+      o.event_ = nullptr;
+      o.synced_ = true;
+    }
+    return *this;
+  }
+
+  HostTransfer(const HostTransfer&) = delete;
+  HostTransfer& operator=(const HostTransfer&) = delete;
+
+ private:
+  template <typename>
+  friend class DeviceMatrix;
+
+  HostTransfer(PlainMatrix&& buf, cudaEvent_t event) : host_buf_(std::move(buf)), event_(event), synced_(false) {}
+
+  PlainMatrix host_buf_;
+  cudaEvent_t event_ = nullptr;
+  bool synced_ = false;
+};
+
+// --------------------------------------------------------------------------
+// DeviceMatrix — typed RAII wrapper for a dense matrix in device memory.
+// --------------------------------------------------------------------------
+
+/** \ingroup GPU_Module
+ * \class DeviceMatrix
+ * \brief RAII wrapper for a dense column-major matrix in GPU device memory.
+ *
+ * \tparam Scalar_  Element type: float, double, complex<float>, complex<double>
+ *
+ * Owns a device allocation with tracked dimensions. Always dense
+ * (leading dimension = rows; no stride padding).
+ * An internal CUDA event records when the data was last written, enabling
+ * safe cross-stream consumption without user-visible synchronization.
+ *
+ * Each method has a synchronous and an asynchronous variant:
+ *  - fromHost() / fromHostAsync(): upload from host
+ *  - toHost() / toHostAsync(): download to host
+ */
+template <typename Scalar_>
+class DeviceMatrix {
+ public:
+  using Scalar = Scalar_;
+  using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+
+  // ---- Construction / destruction ------------------------------------------
+
+  /** Default: empty (0x0, no allocation). */
+  DeviceMatrix() = default;
+
+  /** Allocate uninitialized device memory for a rows x cols matrix. */
+  DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols) {
+    eigen_assert(rows >= 0 && cols >= 0);
+    size_t bytes = sizeInBytes();
+    if (bytes > 0) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
+    }
+  }
+
+  ~DeviceMatrix() {
+    if (data_) (void)cudaFree(data_);
+    if (ready_event_) (void)cudaEventDestroy(ready_event_);
+  }
+
+  // ---- Move-only -----------------------------------------------------------
+
+  DeviceMatrix(DeviceMatrix&& o) noexcept
+      : data_(o.data_),
+        rows_(o.rows_),
+        cols_(o.cols_),
+        ready_event_(o.ready_event_),
+        ready_stream_(o.ready_stream_),
+        retained_buffer_(std::move(o.retained_buffer_)) {
+    o.data_ = nullptr;
+    o.rows_ = 0;
+    o.cols_ = 0;
+    o.ready_event_ = nullptr;
+    o.ready_stream_ = nullptr;
+  }
+
+  DeviceMatrix& operator=(DeviceMatrix&& o) noexcept {
+    if (this != &o) {
+      if (data_) (void)cudaFree(data_);
+      if (ready_event_) (void)cudaEventDestroy(ready_event_);
+      data_ = o.data_;
+      rows_ = o.rows_;
+      cols_ = o.cols_;
+      ready_event_ = o.ready_event_;
+      ready_stream_ = o.ready_stream_;
+      retained_buffer_ = std::move(o.retained_buffer_);
+      o.data_ = nullptr;
+      o.rows_ = 0;
+      o.cols_ = 0;
+      o.ready_event_ = nullptr;
+      o.ready_stream_ = nullptr;
+    }
+    return *this;
+  }
+
+  DeviceMatrix(const DeviceMatrix&) = delete;
+  DeviceMatrix& operator=(const DeviceMatrix&) = delete;
+
+  // ---- Upload from host ----------------------------------------------------
+
+  /** Upload a host Eigen matrix to device memory (synchronous).
+   *
+   * Evaluates the expression into a contiguous ColMajor temporary, copies to
+   * device via cudaMemcpyAsync on \p stream, and synchronizes before returning.
+   *
+   * \param host   Any Eigen matrix expression.
+   * \param stream CUDA stream for the transfer (default: stream 0).
+   */
+  template <typename Derived>
+  static DeviceMatrix fromHost(const MatrixBase<Derived>& host, cudaStream_t stream = nullptr) {
+    const PlainMatrix mat(host.derived());
+    DeviceMatrix dm(mat.rows(), mat.cols());
+    if (dm.sizeInBytes() > 0) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dm.data_, mat.data(), dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
+    }
+    return dm;
+  }
+
+  /** Upload from a raw host pointer to device memory (asynchronous).
+   *
+   * Enqueues an async H2D copy on \p stream and records an internal event.
+   * The caller must keep \p host_data alive until the transfer completes
+   * (check via the internal event or synchronize the stream).
+   *
+   * \param host_data  Pointer to contiguous column-major host data.
+   * \param rows       Number of rows.
+   * \param cols       Number of columns.
+   * \param stream     CUDA stream for the transfer.
+   */
+  static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, cudaStream_t stream) {
+    eigen_assert(rows >= 0 && cols >= 0);
+    eigen_assert(host_data != nullptr || (rows == 0 || cols == 0));
+    DeviceMatrix dm(rows, cols);
+    if (dm.sizeInBytes() > 0) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dm.data_, host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
+      dm.recordReady(stream);
+    }
+    return dm;
+  }
+
+  // ---- Download to host ----------------------------------------------------
+
+  /** Download device matrix to host memory (synchronous).
+   *
+   * Waits on the internal ready event, enqueues a D2H copy on \p stream,
+   * synchronizes, and returns the host matrix directly.
+   *
+   * \param stream CUDA stream for the transfer (default: stream 0).
+   */
+  PlainMatrix toHost(cudaStream_t stream = nullptr) const {
+    PlainMatrix host_buf(rows_, cols_);
+    if (sizeInBytes() > 0) {
+      waitReady(stream);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(host_buf.data(), data_, sizeInBytes(), cudaMemcpyDeviceToHost, stream));
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
+    }
+    return host_buf;
+  }
+
+  /** Enqueue an async device-to-host transfer and return a future.
+   *
+   * Waits on the internal ready event (if any) to ensure the device data is
+   * valid, then enqueues the D2H copy on \p stream. Returns a HostTransfer
+   * future; call .get() to block and retrieve the host matrix.
+   *
+   * \param stream CUDA stream for the transfer (default: stream 0).
+   */
+  HostTransfer<Scalar> toHostAsync(cudaStream_t stream = nullptr) const {
+    PlainMatrix host_buf(rows_, cols_);
+    if (sizeInBytes() > 0) {
+      waitReady(stream);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(host_buf.data(), data_, sizeInBytes(), cudaMemcpyDeviceToHost, stream));
+    }
+    // Record a transfer-complete event.
+    cudaEvent_t transfer_event;
+    EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&transfer_event, cudaEventDisableTiming));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(transfer_event, stream));
+    return HostTransfer<Scalar>(std::move(host_buf), transfer_event);
+  }
+
+  // ---- Device-to-device copy -----------------------------------------------
+
+  /** Deep copy on device. Fully async — records event on the result, no sync.
+   *
+   * \param stream CUDA stream for the D2D copy (default: stream 0).
+   */
+  DeviceMatrix clone(cudaStream_t stream = nullptr) const {
+    DeviceMatrix result(rows_, cols_);
+    if (sizeInBytes() > 0) {
+      waitReady(stream);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data_, data_, sizeInBytes(), cudaMemcpyDeviceToDevice, stream));
+      result.recordReady(stream);
+    }
+    return result;
+  }
+
+  // ---- Resize (destructive) ------------------------------------------------
+
+  /** Discard contents and reallocate to (rows x cols). Clears the ready event. */
+  void resize(Index rows, Index cols) {
+    if (rows == rows_ && cols == cols_) return;
+    if (data_) {
+      (void)cudaFree(data_);
+      data_ = nullptr;
+    }
+    if (ready_event_) {
+      (void)cudaEventDestroy(ready_event_);
+      ready_event_ = nullptr;
+    }
+    ready_stream_ = nullptr;
+    retained_buffer_ = internal::DeviceBuffer();
+    rows_ = rows;
+    cols_ = cols;
+    size_t bytes = sizeInBytes();
+    if (bytes > 0) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
+    }
+  }
+
+  // ---- Accessors -----------------------------------------------------------
+
+  Scalar* data() { return data_; }
+  const Scalar* data() const { return data_; }
+  Index rows() const { return rows_; }
+  Index cols() const { return cols_; }
+  bool empty() const { return rows_ == 0 || cols_ == 0; }
+
+  /** Size of the device allocation in bytes. */
+  size_t sizeInBytes() const { return static_cast<size_t>(rows_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
+
+  // ---- Event synchronization (public for library dispatch interop) ---------
+
+  /** Record that device data is ready after work on \p stream. */
+  void recordReady(cudaStream_t stream) {
+    ensureEvent();
+    EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(ready_event_, stream));
+    ready_stream_ = stream;
+  }
+
+  /** Make \p stream wait until the device data is ready.
+   * No-op if no event recorded, or if the consumer stream is the same as the
+   * producer stream (CUDA guarantees in-order execution within a stream). */
+  void waitReady(cudaStream_t stream) const {
+    if (ready_event_ && stream != ready_stream_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamWaitEvent(stream, ready_event_, 0));
+    }
+  }
+
+  // ---- Expression methods (dispatch to cuBLAS/cuSOLVER) --------------------
+
+  /** Adjoint view for GEMM dispatch. Maps to cublasXgemm with ConjTrans. */
+  DeviceAdjointView<Scalar> adjoint() const { return DeviceAdjointView<Scalar>(*this); }
+
+  /** Transpose view for GEMM dispatch. Maps to cublasXgemm with Trans. */
+  DeviceTransposeView<Scalar> transpose() const { return DeviceTransposeView<Scalar>(*this); }
+
+  /** Bind this matrix to a GpuContext for expression assignment.
+   * Returns a DeviceAssignment proxy: `d_C.device(ctx) = d_A * d_B;` */
+  DeviceAssignment<Scalar> device(GpuContext& ctx) { return DeviceAssignment<Scalar>(*this, ctx); }
+
+  /** Assign from a GEMM expression using the thread-local default GpuContext.
+   * Defined out-of-line after GpuContext is fully declared (see DeviceDispatch.h). */
+  template <typename Lhs, typename Rhs>
+  DeviceMatrix& operator=(const GemmExpr<Lhs, Rhs>& expr);
+
+  /** Accumulate from a GEMM expression using the thread-local default GpuContext. */
+  template <typename Lhs, typename Rhs>
+  DeviceMatrix& operator+=(const GemmExpr<Lhs, Rhs>& expr);
+
+  /** Cholesky view: d_A.llt().solve(d_B) → LltSolveExpr. */
+  DeviceLLTView<Scalar, Lower> llt() const { return DeviceLLTView<Scalar, Lower>(*this); }
+
+  /** Cholesky view with explicit triangle: d_A.llt<Upper>().solve(d_B). */
+  template <int UpLo>
+  DeviceLLTView<Scalar, UpLo> llt() const {
+    return DeviceLLTView<Scalar, UpLo>(*this);
+  }
+
+  /** LU view: d_A.lu().solve(d_B) → LuSolveExpr. */
+  DeviceLUView<Scalar> lu() const { return DeviceLUView<Scalar>(*this); }
+
+  /** Assign from an LLT solve expression (thread-local default context). */
+  template <int UpLo>
+  DeviceMatrix& operator=(const LltSolveExpr<Scalar, UpLo>& expr);
+
+  /** Assign from an LU solve expression (thread-local default context). */
+  DeviceMatrix& operator=(const LuSolveExpr<Scalar>& expr);
+
+  /** Triangular view: d_A.triangularView<Lower>().solve(d_B) → TrsmExpr. */
+  template <int UpLo>
+  DeviceTriangularView<Scalar, UpLo> triangularView() const {
+    return DeviceTriangularView<Scalar, UpLo>(*this);
+  }
+
+  /** Self-adjoint view (mutable): d_C.selfadjointView<Lower>().rankUpdate(d_A). */
+  template <int UpLo>
+  DeviceSelfAdjointView<Scalar, UpLo> selfadjointView() {
+    return DeviceSelfAdjointView<Scalar, UpLo>(*this);
+  }
+
+  /** Self-adjoint view (const): d_A.selfadjointView<Lower>() * d_B → SymmExpr. */
+  template <int UpLo>
+  ConstDeviceSelfAdjointView<Scalar, UpLo> selfadjointView() const {
+    return ConstDeviceSelfAdjointView<Scalar, UpLo>(*this);
+  }
+
+  /** Assign from a TRSM expression (thread-local default context). */
+  template <int UpLo>
+  DeviceMatrix& operator=(const TrsmExpr<Scalar, UpLo>& expr);
+
+  /** Assign from a SYMM expression (thread-local default context). */
+  template <int UpLo>
+  DeviceMatrix& operator=(const SymmExpr<Scalar, UpLo>& expr);
+
+ private:
+  // ---- Private: adopt a raw device pointer (used by friend solvers) --------
+
+  DeviceMatrix(Scalar* device_ptr, Index rows, Index cols) : data_(device_ptr), rows_(rows), cols_(cols) {}
+
+  /** Transfer ownership of the device pointer out. Zeros internal state. */
+  Scalar* release() {
+    Scalar* p = data_;
+    data_ = nullptr;
+    rows_ = 0;
+    cols_ = 0;
+    if (ready_event_) {
+      (void)cudaEventDestroy(ready_event_);
+      ready_event_ = nullptr;
+    }
+    ready_stream_ = nullptr;
+    return p;
+  }
+
+  // ---- Private helpers -------------------------------------------------------
+
+  void ensureEvent() {
+    if (!ready_event_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&ready_event_, cudaEventDisableTiming));
+    }
+  }
+
+  void retainBuffer(internal::DeviceBuffer&& buffer) { retained_buffer_ = std::move(buffer); }
+
+  // ---- Friend declarations ------------------------------------------------
+
+  template <typename, int>
+  friend class GpuLLT;
+  template <typename>
+  friend class GpuLU;
+  template <typename>
+  friend class GpuQR;
+  template <typename>
+  friend class GpuSVD;
+  template <typename>
+  friend class GpuSelfAdjointEigenSolver;
+
+  // ---- Data members --------------------------------------------------------
+
+  Scalar* data_ = nullptr;
+  Index rows_ = 0;
+  Index cols_ = 0;
+  cudaEvent_t ready_event_ = nullptr;       // internal: tracks last write completion
+  cudaStream_t ready_stream_ = nullptr;     // stream that recorded ready_event_ (for same-stream skip)
+  internal::DeviceBuffer retained_buffer_;  // internal: keeps async aux buffers alive
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_DEVICE_MATRIX_H
--- a/Eigen/src/GPU/DeviceSolverExpr.h
+++ b/Eigen/src/GPU/DeviceSolverExpr.h
@@ -0,0 +1,115 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Solver expression types for DeviceMatrix.
+//
+// Each expression maps 1:1 to cuSOLVER library calls:
+//   LltSolveExpr  → cusolverDnXpotrf + cusolverDnXpotrs
+//   LuSolveExpr   → cusolverDnXgetrf + cusolverDnXgetrs
+//
+// Usage:
+//   d_X = d_A.llt().solve(d_B);              // Cholesky solve
+//   d_X.device(ctx) = d_A.lu().solve(d_B);   // LU solve on explicit stream
+
+#ifndef EIGEN_GPU_DEVICE_SOLVER_EXPR_H
+#define EIGEN_GPU_DEVICE_SOLVER_EXPR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// Forward declarations.
+template <typename Scalar_>
+class DeviceMatrix;
+class GpuContext;
+
+// ---- LLT solve expression ---------------------------------------------------
+// d_A.llt().solve(d_B) → LltSolveExpr → cusolverDnXpotrf + cusolverDnXpotrs
+
+template <typename Scalar_, int UpLo_ = Lower>
+class LltSolveExpr {
+ public:
+  using Scalar = Scalar_;
+  enum { UpLo = UpLo_ };
+
+  LltSolveExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
+  const DeviceMatrix<Scalar>& matrix() const { return A_; }
+  const DeviceMatrix<Scalar>& rhs() const { return B_; }
+
+ private:
+  const DeviceMatrix<Scalar>& A_;
+  const DeviceMatrix<Scalar>& B_;
+};
+
+// ---- LU solve expression ----------------------------------------------------
+// d_A.lu().solve(d_B) → LuSolveExpr → cusolverDnXgetrf + cusolverDnXgetrs
+
+template <typename Scalar_>
+class LuSolveExpr {
+ public:
+  using Scalar = Scalar_;
+
+  LuSolveExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
+  const DeviceMatrix<Scalar>& matrix() const { return A_; }
+  const DeviceMatrix<Scalar>& rhs() const { return B_; }
+
+ private:
+  const DeviceMatrix<Scalar>& A_;
+  const DeviceMatrix<Scalar>& B_;
+};
+
+// ---- DeviceLLTView: d_A.llt() → view with .solve() and .device() -----------
+
+template <typename Scalar_, int UpLo_ = Lower>
+class DeviceLLTView {
+ public:
+  using Scalar = Scalar_;
+
+  explicit DeviceLLTView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
+
+  /** Build a solve expression: d_A.llt().solve(d_B).
+   * The expression is evaluated when assigned to a DeviceMatrix. */
+  LltSolveExpr<Scalar, UpLo_> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
+
+  // For cached factorizations, use the explicit GpuLLT API directly:
+  //   GpuLLT<double> llt;
+  //   llt.compute(d_A);
+  //   auto d_X1 = llt.solve(d_B1);
+  //   auto d_X2 = llt.solve(d_B2);
+
+ private:
+  const DeviceMatrix<Scalar>& mat_;
+};
+
+// ---- DeviceLUView: d_A.lu() → view with .solve() and .device() -------------
+
+template <typename Scalar_>
+class DeviceLUView {
+ public:
+  using Scalar = Scalar_;
+
+  explicit DeviceLUView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
+
+  /** Build a solve expression: d_A.lu().solve(d_B). */
+  LuSolveExpr<Scalar> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
+
+  // For cached factorizations, use the explicit GpuLU API directly:
+  //   GpuLU<double> lu;
+  //   lu.compute(d_A);
+  //   auto d_X1 = lu.solve(d_B1);
+  //   auto d_X2 = lu.solve(d_B2);
+
+ private:
+  const DeviceMatrix<Scalar>& mat_;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_DEVICE_SOLVER_EXPR_H
--- a/Eigen/src/GPU/GpuContext.h
+++ b/Eigen/src/GPU/GpuContext.h
@@ -0,0 +1,83 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Unified GPU execution context.
+//
+// GpuContext owns a CUDA stream and all NVIDIA library handles (cuBLAS,
+// cuSOLVER, future cuDSS/cuSPARSE). It is the entry point for all GPU
+// operations on DeviceMatrix.
+//
+// Usage:
+//   GpuContext ctx;                        // explicit context
+//   d_C.device(ctx) = d_A * d_B;          // GEMM on ctx's stream
+//
+//   d_C = d_A * d_B;                      // thread-local default context
+//   GpuContext& ctx = GpuContext::threadLocal();
+
+#ifndef EIGEN_GPU_CONTEXT_H
+#define EIGEN_GPU_CONTEXT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuBlasSupport.h"
+#include "./CuSolverSupport.h"
+
+namespace Eigen {
+
+/** \ingroup GPU_Module
+ * \class GpuContext
+ * \brief Unified GPU execution context owning a CUDA stream and library handles.
+ *
+ * Each GpuContext instance creates a dedicated CUDA stream, a cuBLAS handle,
+ * and a cuSOLVER handle, all bound to that stream. Multiple contexts enable
+ * concurrent execution on independent streams.
+ *
+ * A lazily-created thread-local default is available via threadLocal() for
+ * simple single-stream usage.
+ */
+class GpuContext {
+ public:
+  GpuContext() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
+    EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
+  }
+
+  ~GpuContext() {
+    if (cusolver_) (void)cusolverDnDestroy(cusolver_);
+    if (cublas_) (void)cublasDestroy(cublas_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  // Non-copyable, non-movable (owns library handles).
+  GpuContext(const GpuContext&) = delete;
+  GpuContext& operator=(const GpuContext&) = delete;
+
+  /** Lazily-created thread-local default context. */
+  static GpuContext& threadLocal() {
+    thread_local GpuContext ctx;
+    return ctx;
+  }
+
+  cudaStream_t stream() const { return stream_; }
+  cublasHandle_t cublasHandle() const { return cublas_; }
+  cusolverDnHandle_t cusolverHandle() const { return cusolver_; }
+
+ private:
+  cudaStream_t stream_ = nullptr;
+  cublasHandle_t cublas_ = nullptr;
+  cusolverDnHandle_t cusolver_ = nullptr;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_CONTEXT_H
--- a/Eigen/src/GPU/GpuEigenSolver.h
+++ b/Eigen/src/GPU/GpuEigenSolver.h
@@ -0,0 +1,232 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// GPU self-adjoint eigenvalue decomposition using cuSOLVER.
+//
+// Wraps cusolverDnXsyevd (symmetric/Hermitian divide-and-conquer).
+// Stores eigenvalues and eigenvectors on device.
+//
+// Usage:
+//   GpuSelfAdjointEigenSolver<double> es(A);
+//   VectorXd eigenvals = es.eigenvalues();
+//   MatrixXd eigenvecs = es.eigenvectors();
+
+#ifndef EIGEN_GPU_EIGENSOLVER_H
+#define EIGEN_GPU_EIGENSOLVER_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuSolverSupport.h"
+#include <vector>
+
+namespace Eigen {
+
+template <typename Scalar_>
+class GpuSelfAdjointEigenSolver {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+  using RealVector = Matrix<RealScalar, Dynamic, 1>;
+
+  /** Eigenvalue-only or eigenvalues + eigenvectors. */
+  enum ComputeMode { EigenvaluesOnly, ComputeEigenvectors };
+
+  GpuSelfAdjointEigenSolver() { init_context(); }
+
+  template <typename InputType>
+  explicit GpuSelfAdjointEigenSolver(const EigenBase<InputType>& A, ComputeMode mode = ComputeEigenvectors) {
+    init_context();
+    compute(A, mode);
+  }
+
+  ~GpuSelfAdjointEigenSolver() {
+    if (handle_) (void)cusolverDnDestroy(handle_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  GpuSelfAdjointEigenSolver(const GpuSelfAdjointEigenSolver&) = delete;
+  GpuSelfAdjointEigenSolver& operator=(const GpuSelfAdjointEigenSolver&) = delete;
+
+  // ---- Factorization -------------------------------------------------------
+
+  template <typename InputType>
+  GpuSelfAdjointEigenSolver& compute(const EigenBase<InputType>& A, ComputeMode mode = ComputeEigenvectors) {
+    eigen_assert(A.rows() == A.cols() && "GpuSelfAdjointEigenSolver requires a square matrix");
+    mode_ = mode;
+    n_ = A.rows();
+    info_ = InvalidInput;
+    info_synced_ = false;
+
+    if (n_ == 0) {
+      info_ = Success;
+      info_synced_ = true;
+      return *this;
+    }
+
+    const PlainMatrix mat(A.derived());
+    lda_ = static_cast<int64_t>(n_);
+    const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
+
+    // syevd overwrites A with eigenvectors (if requested).
+    d_A_ = internal::DeviceBuffer(mat_bytes);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
+    factorize();
+    return *this;
+  }
+
+  GpuSelfAdjointEigenSolver& compute(const DeviceMatrix<Scalar>& d_A, ComputeMode mode = ComputeEigenvectors) {
+    eigen_assert(d_A.rows() == d_A.cols() && "GpuSelfAdjointEigenSolver requires a square matrix");
+    mode_ = mode;
+    n_ = d_A.rows();
+    info_ = InvalidInput;
+    info_synced_ = false;
+
+    if (n_ == 0) {
+      info_ = Success;
+      info_synced_ = true;
+      return *this;
+    }
+
+    d_A.waitReady(stream_);
+    lda_ = static_cast<int64_t>(n_);
+    const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
+
+    d_A_ = internal::DeviceBuffer(mat_bytes);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
+
+    factorize();
+    return *this;
+  }
+
+  // ---- Accessors -----------------------------------------------------------
+
+  ComputationInfo info() const {
+    sync_info();
+    return info_;
+  }
+
+  Index cols() const { return n_; }
+  Index rows() const { return n_; }
+
+  // TODO: Add device-side accessors (deviceEigenvalues(), deviceEigenvectors())
+  // returning DeviceMatrix views of the internal buffers, so users can chain
+  // GPU operations without round-tripping through host memory.
+
+  /** Eigenvalues in ascending order. Downloads from device. */
+  RealVector eigenvalues() const {
+    sync_info();
+    eigen_assert(info_ == Success);
+    RealVector W(n_);
+    if (n_ > 0) {
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpy(W.data(), d_W_.ptr, static_cast<size_t>(n_) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
+    }
+    return W;
+  }
+
+  /** Eigenvectors (columns). Downloads from device.
+   * Requires ComputeEigenvectors mode. */
+  PlainMatrix eigenvectors() const {
+    sync_info();
+    eigen_assert(info_ == Success);
+    eigen_assert(mode_ == ComputeEigenvectors && "eigenvectors() requires ComputeEigenvectors mode");
+    PlainMatrix V(n_, n_);
+    if (n_ > 0) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(V.data(), d_A_.ptr,
+                                          static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar),
+                                          cudaMemcpyDeviceToHost));
+    }
+    return V;
+  }
+
+  cudaStream_t stream() const { return stream_; }
+
+ private:
+  cudaStream_t stream_ = nullptr;
+  cusolverDnHandle_t handle_ = nullptr;
+  internal::CusolverParams params_;
+  internal::DeviceBuffer d_A_;        // overwritten with eigenvectors by syevd
+  internal::DeviceBuffer d_W_;        // eigenvalues (RealScalar, length n)
+  internal::DeviceBuffer d_scratch_;  // workspace + info
+  size_t scratch_size_ = 0;
+  std::vector<char> h_workspace_;
+  ComputeMode mode_ = ComputeEigenvectors;
+  Index n_ = 0;
+  int64_t lda_ = 0;
+  ComputationInfo info_ = InvalidInput;
+  int info_word_ = 0;
+  bool info_synced_ = true;
+
+  void init_context() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
+    ensure_scratch(0);
+  }
+
+  void ensure_scratch(size_t workspace_bytes) {
+    constexpr size_t kAlign = 16;
+    workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
+    size_t needed = workspace_bytes + sizeof(int);
+    if (needed > scratch_size_) {
+      if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      d_scratch_ = internal::DeviceBuffer(needed);
+      scratch_size_ = needed;
+    }
+  }
+
+  void* scratch_workspace() const { return d_scratch_.ptr; }
+  int* scratch_info() const {
+    return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
+  }
+
+  void sync_info() const {
+    if (!info_synced_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      const_cast<GpuSelfAdjointEigenSolver*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
+      const_cast<GpuSelfAdjointEigenSolver*>(this)->info_synced_ = true;
+    }
+  }
+
+  void factorize() {
+    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
+    constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
+
+    info_synced_ = false;
+    info_ = InvalidInput;
+
+    d_W_ = internal::DeviceBuffer(static_cast<size_t>(n_) * sizeof(RealScalar));
+
+    const cusolverEigMode_t jobz =
+        (mode_ == ComputeEigenvectors) ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
+
+    // Use lower triangle (standard convention).
+    constexpr cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
+
+    size_t dev_ws = 0, host_ws = 0;
+    EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd_bufferSize(handle_, params_.p, jobz, uplo, static_cast<int64_t>(n_), dtype,
+                                                     d_A_.ptr, lda_, rtype, d_W_.ptr, dtype, &dev_ws, &host_ws));
+
+    ensure_scratch(dev_ws);
+    h_workspace_.resize(host_ws);
+
+    EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd(handle_, params_.p, jobz, uplo, static_cast<int64_t>(n_), dtype, d_A_.ptr,
+                                          lda_, rtype, d_W_.ptr, dtype, scratch_workspace(), dev_ws,
+                                          host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
+
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_EIGENSOLVER_H
--- a/Eigen/src/GPU/GpuFFT.h
+++ b/Eigen/src/GPU/GpuFFT.h
@@ -0,0 +1,308 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// GPU FFT via cuFFT.
+//
+// Standalone GPU FFT class with plan caching. Supports 1D and 2D transforms:
+// C2C (complex-to-complex), R2C (real-to-complex), C2R (complex-to-real).
+//
+// Inverse transforms are scaled by 1/n (1D) or 1/(n*m) (2D) so that
+// inv(fwd(x)) == x, matching Eigen's FFT convention.
+//
+// cuFFT plans are cached by (size, type) and reused across calls.
+//
+// Usage:
+//   GpuFFT<float> fft;
+//   VectorXcf X = fft.fwd(x);         // 1D C2C or R2C
+//   VectorXcf y = fft.inv(X);         // 1D C2C inverse
+//   VectorXf  r = fft.invReal(X, n);  // 1D C2R inverse
+//   MatrixXcf B = fft.fwd2d(A);       // 2D C2C forward
+//   MatrixXcf C = fft.inv2d(B);       // 2D C2C inverse
+
+#ifndef EIGEN_GPU_FFT_H
+#define EIGEN_GPU_FFT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuFftSupport.h"
+#include "./CuBlasSupport.h"
+#include <map>
+
+namespace Eigen {
+
+template <typename Scalar_>
+class GpuFFT {
+ public:
+  using Scalar = Scalar_;
+  using Complex = std::complex<Scalar>;
+  using ComplexVector = Matrix<Complex, Dynamic, 1>;
+  using RealVector = Matrix<Scalar, Dynamic, 1>;
+  using ComplexMatrix = Matrix<Complex, Dynamic, Dynamic, ColMajor>;
+
+  GpuFFT() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
+    EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
+  }
+
+  ~GpuFFT() {
+    for (auto& kv : plans_) (void)cufftDestroy(kv.second);
+    if (cublas_) (void)cublasDestroy(cublas_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  GpuFFT(const GpuFFT&) = delete;
+  GpuFFT& operator=(const GpuFFT&) = delete;
+
+  // ---- 1D Complex-to-Complex ------------------------------------------------
+
+  /** Forward 1D C2C FFT. */
+  template <typename Derived>
+  ComplexVector fwd(const MatrixBase<Derived>& x,
+                    typename std::enable_if<NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
+    const ComplexVector input(x.derived());
+    const int n = static_cast<int>(input.size());
+    if (n == 0) return ComplexVector(0);
+
+    ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
+                                                      static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));
+
+    ComplexVector result(n);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  /** Inverse 1D C2C FFT. Scaled by 1/n. */
+  template <typename Derived>
+  ComplexVector inv(const MatrixBase<Derived>& X) {
+    static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv() requires complex input");
+    const ComplexVector input(X.derived());
+    const int n = static_cast<int>(input.size());
+    if (n == 0) return ComplexVector(0);
+
+    ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
+                                                      static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));
+
+    // Scale by 1/n.
+    scale_device(static_cast<Complex*>(d_out_.ptr), n, Scalar(1) / Scalar(n));
+
+    ComplexVector result(n);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  // ---- 1D Real-to-Complex ---------------------------------------------------
+
+  /** Forward 1D R2C FFT. Returns n/2+1 complex values (half-spectrum). */
+  template <typename Derived>
+  ComplexVector fwd(const MatrixBase<Derived>& x,
+                    typename std::enable_if<!NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
+    const RealVector input(x.derived());
+    const int n = static_cast<int>(input.size());
+    if (n == 0) return ComplexVector(0);
+
+    const int n_complex = n / 2 + 1;
+    ensure_buffers(n * sizeof(Scalar), n_complex * sizeof(Complex));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_1d(n, internal::cufft_r2c_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(
+        internal::cufftExecR2C_dispatch(plan, static_cast<Scalar*>(d_in_.ptr), static_cast<Complex*>(d_out_.ptr)));
+
+    ComplexVector result(n_complex);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(result.data(), d_out_.ptr, n_complex * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  // ---- 1D Complex-to-Real ---------------------------------------------------
+
+  /** Inverse 1D C2R FFT. Input is n/2+1 complex values, output is nfft real values.
+   * Scaled by 1/nfft. Caller must specify nfft (original real signal length). */
+  template <typename Derived>
+  RealVector invReal(const MatrixBase<Derived>& X, Index nfft) {
+    static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "invReal() requires complex input");
+    const ComplexVector input(X.derived());
+    const int n = static_cast<int>(nfft);
+    const int n_complex = n / 2 + 1;
+    eigen_assert(input.size() == n_complex);
+    if (n == 0) return RealVector(0);
+
+    ensure_buffers(n_complex * sizeof(Complex), n * sizeof(Scalar));
+    // cuFFT C2R may overwrite the input, so we copy to d_in_.
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_in_.ptr, input.data(), n_complex * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_1d(n, internal::cufft_c2r_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(
+        internal::cufftExecC2R_dispatch(plan, static_cast<Complex*>(d_in_.ptr), static_cast<Scalar*>(d_out_.ptr)));
+
+    // Scale by 1/n.
+    scale_device_real(static_cast<Scalar*>(d_out_.ptr), n, Scalar(1) / Scalar(n));
+
+    RealVector result(n);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  // ---- 2D Complex-to-Complex ------------------------------------------------
+
+  /** Forward 2D C2C FFT. Input and output are rows x cols complex matrices. */
+  template <typename Derived>
+  ComplexMatrix fwd2d(const MatrixBase<Derived>& A) {
+    static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "fwd2d() requires complex input");
+    const ComplexMatrix input(A.derived());
+    const int rows = static_cast<int>(input.rows());
+    const int cols = static_cast<int>(input.cols());
+    if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
+
+    const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
+    ensure_buffers(total, total);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
+                                                      static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));
+
+    ComplexMatrix result(rows, cols);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  /** Inverse 2D C2C FFT. Scaled by 1/(rows*cols). */
+  template <typename Derived>
+  ComplexMatrix inv2d(const MatrixBase<Derived>& A) {
+    static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv2d() requires complex input");
+    const ComplexMatrix input(A.derived());
+    const int rows = static_cast<int>(input.rows());
+    const int cols = static_cast<int>(input.cols());
+    if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
+
+    const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
+    ensure_buffers(total, total);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
+                                                      static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));
+
+    // Scale by 1/(rows*cols).
+    const int total_elems = rows * cols;
+    scale_device(static_cast<Complex*>(d_out_.ptr), total_elems, Scalar(1) / Scalar(total_elems));
+
+    ComplexMatrix result(rows, cols);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  // ---- Accessors ------------------------------------------------------------
+
+  cudaStream_t stream() const { return stream_; }
+
+ private:
+  cudaStream_t stream_ = nullptr;
+  cublasHandle_t cublas_ = nullptr;
+  std::map<int64_t, cufftHandle> plans_;
+  internal::DeviceBuffer d_in_;
+  internal::DeviceBuffer d_out_;
+  size_t d_in_size_ = 0;
+  size_t d_out_size_ = 0;
+
+  void ensure_buffers(size_t in_bytes, size_t out_bytes) {
+    if (in_bytes > d_in_size_) {
+      if (d_in_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      d_in_ = internal::DeviceBuffer(in_bytes);
+      d_in_size_ = in_bytes;
+    }
+    if (out_bytes > d_out_size_) {
+      if (d_out_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      d_out_ = internal::DeviceBuffer(out_bytes);
+      d_out_size_ = out_bytes;
+    }
+  }
+
+  // Plan key encoding: rank (1 bit) | type (4 bits) | dims
+  static int64_t plan_key_1d(int n, cufftType type) { return (int64_t(n) << 5) | (int64_t(type) << 1) | 0; }
+
+  static int64_t plan_key_2d(int rows, int cols, cufftType type) {
+    return (int64_t(rows) << 35) | (int64_t(cols) << 5) | (int64_t(type) << 1) | 1;
+  }
+
+  cufftHandle get_plan_1d(int n, cufftType type) {
+    int64_t key = plan_key_1d(n, type);
+    auto it = plans_.find(key);
+    if (it != plans_.end()) return it->second;
+
+    cufftHandle plan;
+    EIGEN_CUFFT_CHECK(cufftPlan1d(&plan, n, type, /*batch=*/1));
+    EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
+    plans_[key] = plan;
+    return plan;
+  }
+
+  cufftHandle get_plan_2d(int rows, int cols, cufftType type) {
+    int64_t key = plan_key_2d(rows, cols, type);
+    auto it = plans_.find(key);
+    if (it != plans_.end()) return it->second;
+
+    // cuFFT uses row-major (C order) for 2D: first dim = rows, second = cols.
+    // Eigen matrices are column-major, so we pass (cols, rows) to cuFFT
+    // to get the correct 2D transform.
+    cufftHandle plan;
+    EIGEN_CUFFT_CHECK(cufftPlan2d(&plan, cols, rows, type));
+    EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
+    plans_[key] = plan;
+    return plan;
+  }
+
+  // Scale complex array on device using cuBLAS scal.
+  void scale_device(Complex* d_ptr, int n, Scalar alpha) { scale_complex(cublas_, d_ptr, n, alpha); }
+
+  // Scale real array on device using cuBLAS scal.
+  void scale_device_real(Scalar* d_ptr, int n, Scalar alpha) { scale_real(cublas_, d_ptr, n, alpha); }
+
+  // Type-dispatched cuBLAS scal wrappers (C++14 compatible).
+  static void scale_complex(cublasHandle_t h, std::complex<float>* p, int n, float a) {
+    EIGEN_CUBLAS_CHECK(cublasCsscal(h, n, &a, reinterpret_cast<cuComplex*>(p), 1));
+  }
+  static void scale_complex(cublasHandle_t h, std::complex<double>* p, int n, double a) {
+    EIGEN_CUBLAS_CHECK(cublasZdscal(h, n, &a, reinterpret_cast<cuDoubleComplex*>(p), 1));
+  }
+  static void scale_real(cublasHandle_t h, float* p, int n, float a) {
+    EIGEN_CUBLAS_CHECK(cublasSscal(h, n, &a, p, 1));
+  }
+  static void scale_real(cublasHandle_t h, double* p, int n, double a) {
+    EIGEN_CUBLAS_CHECK(cublasDscal(h, n, &a, p, 1));
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_FFT_H
--- a/Eigen/src/GPU/GpuLLT.h
+++ b/Eigen/src/GPU/GpuLLT.h
@@ -0,0 +1,385 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// GPU Cholesky (LLT) decomposition using cuSOLVER.
+//
+// Unlike Eigen's CPU LLT<MatrixType>, GpuLLT keeps the factored Cholesky
+// factor in device memory for the lifetime of the object. Multiple solves
+// against the same factor therefore only transfer the RHS and solution
+// vectors, not the factor itself.
+//
+// Requires CUDA 11.0+ (cusolverDnXpotrf / cusolverDnXpotrs generic API).
+// Requires CUDA 11.4+ (cusolverDnX generic API + cudaMallocAsync).
+//
+// Usage:
+//   GpuLLT<double> llt(A);              // upload A, potrf, L stays on device
+//   if (llt.info() != Success) { ... }
+//   MatrixXd x1 = llt.solve(b1);        // potrs, only b1 transferred
+//   MatrixXd x2 = llt.solve(b2);        // L already on device
+
+#ifndef EIGEN_GPU_LLT_H
+#define EIGEN_GPU_LLT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuSolverSupport.h"
+#include <vector>
+
+namespace Eigen {
+
+/** \ingroup GPU_Module
+ * \class GpuLLT
+ * \brief GPU Cholesky (LL^T) decomposition via cuSOLVER
+ *
+ * \tparam Scalar_  Element type: float, double, complex<float>, complex<double>
+ * \tparam UpLo_    Triangle used: Lower (default) or Upper
+ *
+ * Factorizes a symmetric positive-definite matrix A = LL^H on the GPU and
+ * caches the factor L in device memory. Each subsequent solve(B) uploads only
+ * B, calls cusolverDnXpotrs, and downloads the result — the factor is not
+ * re-transferred.
+ *
+ * Each GpuLLT object owns a dedicated CUDA stream and cuSOLVER handle,
+ * enabling concurrent factorizations from multiple objects on the same host
+ * thread.
+ */
+template <typename Scalar_, int UpLo_ = Lower>
+class GpuLLT {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+
+  enum { UpLo = UpLo_ };
+
+  // ---- Construction / destruction ------------------------------------------
+
+  /** Default constructor. Does not factorize; call compute() before solve(). */
+  GpuLLT() { init_context(); }
+
+  /** Factor A immediately. Equivalent to GpuLLT llt; llt.compute(A). */
+  template <typename InputType>
+  explicit GpuLLT(const EigenBase<InputType>& A) {
+    init_context();
+    compute(A);
+  }
+
+  ~GpuLLT() {
+    // Ignore errors in destructors — cannot propagate.
+    if (handle_) (void)cusolverDnDestroy(handle_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  // Non-copyable (owns device memory and library handles).
+  GpuLLT(const GpuLLT&) = delete;
+  GpuLLT& operator=(const GpuLLT&) = delete;
+
+  // Movable.
+  GpuLLT(GpuLLT&& o) noexcept
+      : stream_(o.stream_),
+        handle_(o.handle_),
+        params_(std::move(o.params_)),
+        d_factor_(std::move(o.d_factor_)),
+        factor_alloc_size_(o.factor_alloc_size_),
+        d_scratch_(std::move(o.d_scratch_)),
+        scratch_size_(o.scratch_size_),
+        h_workspace_(std::move(o.h_workspace_)),
+        n_(o.n_),
+        lda_(o.lda_),
+        info_(o.info_),
+        info_word_(o.info_word_),
+        info_synced_(o.info_synced_) {
+    o.stream_ = nullptr;
+    o.handle_ = nullptr;
+    o.factor_alloc_size_ = 0;
+    o.scratch_size_ = 0;
+    o.n_ = 0;
+    o.info_ = InvalidInput;
+    o.info_word_ = 0;
+    o.info_synced_ = true;
+  }
+
+  GpuLLT& operator=(GpuLLT&& o) noexcept {
+    if (this != &o) {
+      if (handle_) (void)cusolverDnDestroy(handle_);
+      if (stream_) (void)cudaStreamDestroy(stream_);
+      stream_ = o.stream_;
+      handle_ = o.handle_;
+      params_ = std::move(o.params_);
+      d_factor_ = std::move(o.d_factor_);
+      factor_alloc_size_ = o.factor_alloc_size_;
+      d_scratch_ = std::move(o.d_scratch_);
+      scratch_size_ = o.scratch_size_;
+      h_workspace_ = std::move(o.h_workspace_);
+      n_ = o.n_;
+      lda_ = o.lda_;
+      info_ = o.info_;
+      info_word_ = o.info_word_;
+      info_synced_ = o.info_synced_;
+      o.stream_ = nullptr;
+      o.handle_ = nullptr;
+      o.factor_alloc_size_ = 0;
+      o.scratch_size_ = 0;
+      o.n_ = 0;
+      o.info_ = InvalidInput;
+      o.info_word_ = 0;
+      o.info_synced_ = true;
+    }
+    return *this;
+  }
+
+  // ---- Factorization -------------------------------------------------------
+
+  /** Compute the Cholesky factorization of A (host matrix).
+   *
+   * Uploads A to device memory, calls cusolverDnXpotrf, and retains the
+   * factored matrix on device. Any previous factorization is overwritten.
+   */
+  template <typename InputType>
+  GpuLLT& compute(const EigenBase<InputType>& A) {
+    eigen_assert(A.rows() == A.cols());
+    if (!begin_compute(A.rows())) return *this;
+
+    // Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions).
+    const PlainMatrix mat(A.derived());
+    lda_ = static_cast<int64_t>(mat.rows());
+    allocate_factor_storage();
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_factor_.ptr, mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_));
+
+    factorize();
+    return *this;
+  }
+
+  /** Compute the Cholesky factorization from a device-resident matrix (D2D copy). */
+  GpuLLT& compute(const DeviceMatrix<Scalar>& d_A) {
+    eigen_assert(d_A.rows() == d_A.cols());
+    if (!begin_compute(d_A.rows())) return *this;
+
+    lda_ = static_cast<int64_t>(d_A.rows());
+    d_A.waitReady(stream_);
+    allocate_factor_storage();
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_factor_.ptr, d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, stream_));
+
+    factorize();
+    return *this;
+  }
+
+  /** Compute the Cholesky factorization from a device matrix (move, no copy). */
+  GpuLLT& compute(DeviceMatrix<Scalar>&& d_A) {
+    eigen_assert(d_A.rows() == d_A.cols());
+    if (!begin_compute(d_A.rows())) return *this;
+
+    lda_ = static_cast<int64_t>(d_A.rows());
+    d_A.waitReady(stream_);
+    d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
+
+    factorize();
+    return *this;
+  }
+
+  // ---- Solve ---------------------------------------------------------------
+
+  /** Solve A * X = B using the cached Cholesky factor (host → host).
+   *
+   * Uploads B to device memory, calls cusolverDnXpotrs using the factor
+   * retained from compute(), and returns the solution X on the host.
+   * The factor is not re-transferred; only B goes up and X comes down.
+   *
+   * \pre compute() must have been called and info() == Success.
+   * \returns X such that A * X ≈ B
+   */
+  template <typename Rhs>
+  PlainMatrix solve(const MatrixBase<Rhs>& B) const {
+    const_cast<GpuLLT*>(this)->sync_info();
+    eigen_assert(info_ == Success && "GpuLLT::solve called on a failed or uninitialized factorization");
+    eigen_assert(B.rows() == n_);
+
+    const PlainMatrix rhs(B);
+    const int64_t nrhs = static_cast<int64_t>(rhs.cols());
+    const int64_t ldb = static_cast<int64_t>(rhs.rows());
+    DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpyAsync(d_x_ptr, rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
+    });
+
+    PlainMatrix X(n_, B.cols());
+    int solve_info = 0;
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+
+    eigen_assert(solve_info == 0 && "cusolverDnXpotrs reported an error");
+    return X;
+  }
+
+  /** Solve A * X = B with device-resident RHS. Fully async.
+   *
+   * All work is enqueued on this solver's stream. Returns a DeviceMatrix
+   * with a recorded ready event — no host synchronization occurs.
+   * The caller should check info() after compute() to verify the
+   * factorization succeeded; this method does not check.
+   */
+  DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
+    eigen_assert(d_B.rows() == n_);
+    d_B.waitReady(stream_);
+    const int64_t nrhs = static_cast<int64_t>(d_B.cols());
+    const int64_t ldb = static_cast<int64_t>(d_B.rows());
+    return solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpyAsync(d_x_ptr, d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
+    });
+  }
+
+  // ---- Accessors -----------------------------------------------------------
+
+  /** Returns Success if the last compute() succeeded, NumericalIssue otherwise.
+   * Lazily synchronizes the stream on first call after compute(). */
+  ComputationInfo info() const {
+    const_cast<GpuLLT*>(this)->sync_info();
+    return info_;
+  }
+
+  Index rows() const { return n_; }
+  Index cols() const { return n_; }
+
+  /** Returns the CUDA stream owned by this object.
+   *  Advanced users may submit additional GPU work on this stream
+   *  to overlap with or chain after GpuLLT operations. */
+  cudaStream_t stream() const { return stream_; }
+
+ private:
+  cudaStream_t stream_ = nullptr;
+  cusolverDnHandle_t handle_ = nullptr;
+  internal::CusolverParams params_;   // cuSOLVER params (created once, reused)
+  internal::DeviceBuffer d_factor_;   // factored L (or U) on device (grows, never shrinks)
+  size_t factor_alloc_size_ = 0;      // current d_factor_ allocation size
+  internal::DeviceBuffer d_scratch_;  // combined workspace + info word (grows, never shrinks)
+  size_t scratch_size_ = 0;           // current scratch allocation size
+  std::vector<char> h_workspace_;     // host workspace (kept alive until next compute)
+  Index n_ = 0;
+  int64_t lda_ = 0;
+  ComputationInfo info_ = InvalidInput;
+  int info_word_ = 0;        // host-side target for async info download
+  bool info_synced_ = true;  // has the stream been synced for info?
+
+  bool begin_compute(Index rows) {
+    n_ = rows;
+    info_ = InvalidInput;
+    if (n_ == 0) {
+      info_ = Success;
+      return false;
+    }
+    return true;
+  }
+
+  size_t factorBytes() const { return rhsBytes(static_cast<int64_t>(n_), lda_); }
+
+  static size_t rhsBytes(int64_t cols, int64_t outer_stride) {
+    return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
+  }
+
+  void allocate_factor_storage() {
+    size_t needed = factorBytes();
+    if (needed > factor_alloc_size_) {
+      d_factor_ = internal::DeviceBuffer(needed);
+      factor_alloc_size_ = needed;
+    }
+  }
+
+  // Ensure d_scratch_ is at least `workspace_bytes + sizeof(int)`.
+  // Layout: [workspace (workspace_bytes) | info_word (sizeof(int))].
+  // Ensure d_scratch_ can hold workspace_bytes + an aligned info word.
+  // Grows but never shrinks. Syncs the stream before reallocating to
+  // avoid freeing memory that async kernels may still be using.
+  void ensure_scratch(size_t workspace_bytes) {
+    // Round up so the info word is naturally aligned.
+    // 16-byte alignment for optimal GPU memory access.
+    constexpr size_t kAlign = 16;
+    workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
+    size_t needed = workspace_bytes + sizeof(int);
+    if (needed > scratch_size_) {
+      if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      d_scratch_ = internal::DeviceBuffer(needed);
+      scratch_size_ = needed;
+    }
+  }
+
+  void* scratch_workspace() const { return d_scratch_.ptr; }
+  int* scratch_info() const {
+    return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
+  }
+
+  template <typename CopyRhs>
+  DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, CopyRhs&& copy_rhs) const {
+    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
+    constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_, ColMajor>::value;
+
+    Scalar* d_x_ptr = nullptr;
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_x_ptr), rhsBytes(nrhs, ldb)));
+    copy_rhs(d_x_ptr);
+
+    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, static_cast<int64_t>(n_), nrhs, dtype,
+                                          d_factor_.ptr, lda_, dtype, d_x_ptr, ldb, scratch_info()));
+
+    DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs));
+    result.recordReady(stream_);
+    return result;
+  }
+
+  void init_context() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
+    ensure_scratch(0);  // allocate at least the info word
+  }
+
+  // Synchronize stream and interpret the info word. No-op if already synced.
+  void sync_info() {
+    if (!info_synced_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      info_ = (info_word_ == 0) ? Success : NumericalIssue;
+      info_synced_ = true;
+    }
+  }
+
+  // Run cusolverDnXpotrf on d_factor_ (already on device).
+  // Enqueues factorization + async info download. Does NOT sync.
+  // Workspaces are stored as members to ensure they outlive the async kernels.
+  void factorize() {
+    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
+    constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_, ColMajor>::value;
+
+    info_synced_ = false;
+    info_ = InvalidInput;
+
+    size_t dev_ws_bytes = 0, host_ws_bytes = 0;
+    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(handle_, params_.p, uplo, static_cast<int64_t>(n_), dtype,
+                                                     d_factor_.ptr, lda_, dtype, &dev_ws_bytes, &host_ws_bytes));
+
+    ensure_scratch(dev_ws_bytes);
+    h_workspace_.resize(host_ws_bytes);
+
+    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(
+        handle_, params_.p, uplo, static_cast<int64_t>(n_), dtype, d_factor_.ptr, lda_, dtype, scratch_workspace(),
+        dev_ws_bytes, host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info()));
+
+    // Enqueue async download of info word — sync deferred to info() or solve().
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_LLT_H
--- a/Eigen/src/GPU/GpuLU.h
+++ b/Eigen/src/GPU/GpuLU.h
@@ -0,0 +1,371 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// GPU partial-pivoting LU decomposition using cuSOLVER.
+//
+// Wraps cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve).
+// The factored LU matrix and pivot array are kept in device memory for the
+// lifetime of the object, so repeated solves only transfer the RHS/solution.
+//
+// Requires CUDA 11.0+ (cusolverDnX generic API).
+//
+// Usage:
+//   GpuLU<double> lu(A);              // upload A, getrf, LU+ipiv on device
+//   if (lu.info() != Success) { ... }
+//   MatrixXd x = lu.solve(b);         // getrs NoTrans, only b transferred
+//   MatrixXd xt = lu.solve(b, GpuLU<double>::Transpose);   // A^T x = b
+
+#ifndef EIGEN_GPU_LU_H
+#define EIGEN_GPU_LU_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuSolverSupport.h"
+#include <vector>
+
+namespace Eigen {
+
+/** \ingroup GPU_Module
+ * \class GpuLU
+ * \brief GPU LU decomposition with partial pivoting via cuSOLVER
+ *
+ * \tparam Scalar_  Element type: float, double, complex<float>, complex<double>
+ *
+ * Decomposes a square matrix A = P L U on the GPU and retains the factored
+ * matrix and pivot array in device memory. Solves A*X=B, A^T*X=B, or
+ * A^H*X=B by passing the appropriate TransposeMode.
+ *
+ * Each GpuLU object owns a dedicated CUDA stream and cuSOLVER handle.
+ */
+template <typename Scalar_>
+class GpuLU {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+
+  /** Controls which system is solved in solve(). */
+  enum TransposeMode {
+    NoTranspose,        ///< Solve A   * X = B
+    Transpose,          ///< Solve A^T * X = B
+    ConjugateTranspose  ///< Solve A^H * X = B (same as Transpose for real types)
+  };
+
+  // ---- Construction / destruction ------------------------------------------
+
+  GpuLU() { init_context(); }
+
+  template <typename InputType>
+  explicit GpuLU(const EigenBase<InputType>& A) {
+    init_context();
+    compute(A);
+  }
+
+  ~GpuLU() {
+    if (handle_) (void)cusolverDnDestroy(handle_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  GpuLU(const GpuLU&) = delete;
+  GpuLU& operator=(const GpuLU&) = delete;
+
+  GpuLU(GpuLU&& o) noexcept
+      : stream_(o.stream_),
+        handle_(o.handle_),
+        params_(std::move(o.params_)),
+        d_lu_(std::move(o.d_lu_)),
+        lu_alloc_size_(o.lu_alloc_size_),
+        d_ipiv_(std::move(o.d_ipiv_)),
+        d_scratch_(std::move(o.d_scratch_)),
+        scratch_size_(o.scratch_size_),
+        h_workspace_(std::move(o.h_workspace_)),
+        n_(o.n_),
+        lda_(o.lda_),
+        info_(o.info_),
+        info_word_(o.info_word_),
+        info_synced_(o.info_synced_) {
+    o.stream_ = nullptr;
+    o.handle_ = nullptr;
+    o.lu_alloc_size_ = 0;
+    o.scratch_size_ = 0;
+    o.n_ = 0;
+    o.info_ = InvalidInput;
+    o.info_word_ = 0;
+    o.info_synced_ = true;
+  }
+
+  GpuLU& operator=(GpuLU&& o) noexcept {
+    if (this != &o) {
+      if (handle_) (void)cusolverDnDestroy(handle_);
+      if (stream_) (void)cudaStreamDestroy(stream_);
+      stream_ = o.stream_;
+      handle_ = o.handle_;
+      params_ = std::move(o.params_);
+      d_lu_ = std::move(o.d_lu_);
+      lu_alloc_size_ = o.lu_alloc_size_;
+      d_ipiv_ = std::move(o.d_ipiv_);
+      d_scratch_ = std::move(o.d_scratch_);
+      scratch_size_ = o.scratch_size_;
+      h_workspace_ = std::move(o.h_workspace_);
+      n_ = o.n_;
+      lda_ = o.lda_;
+      info_ = o.info_;
+      info_word_ = o.info_word_;
+      info_synced_ = o.info_synced_;
+      o.stream_ = nullptr;
+      o.handle_ = nullptr;
+      o.lu_alloc_size_ = 0;
+      o.scratch_size_ = 0;
+      o.n_ = 0;
+      o.info_ = InvalidInput;
+      o.info_word_ = 0;
+      o.info_synced_ = true;
+    }
+    return *this;
+  }
+
+  // ---- Factorization -------------------------------------------------------
+
+  /** Compute the LU factorization of A (host matrix, must be square). */
+  template <typename InputType>
+  GpuLU& compute(const EigenBase<InputType>& A) {
+    eigen_assert(A.rows() == A.cols() && "GpuLU requires a square matrix");
+    if (!begin_compute(A.rows())) return *this;
+
+    const PlainMatrix mat(A.derived());
+    lda_ = static_cast<int64_t>(mat.rows());
+    allocate_lu_storage();
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_));
+
+    factorize();
+    return *this;
+  }
+
+  /** Compute the LU factorization from a device-resident matrix (D2D copy). */
+  GpuLU& compute(const DeviceMatrix<Scalar>& d_A) {
+    eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
+    if (!begin_compute(d_A.rows())) return *this;
+
+    lda_ = static_cast<int64_t>(d_A.rows());
+    d_A.waitReady(stream_);
+    allocate_lu_storage();
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_));
+
+    factorize();
+    return *this;
+  }
+
+  /** Compute the LU factorization from a device matrix (move, no copy). */
+  GpuLU& compute(DeviceMatrix<Scalar>&& d_A) {
+    eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
+    if (!begin_compute(d_A.rows())) return *this;
+
+    lda_ = static_cast<int64_t>(d_A.rows());
+    d_A.waitReady(stream_);
+    d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
+
+    factorize();
+    return *this;
+  }
+
+  // ---- Solve ---------------------------------------------------------------
+
+  /** Solve op(A) * X = B using the cached LU factorization (host → host).
+   *
+   * \param B    Right-hand side (n x nrhs host matrix).
+   * \param mode NoTranspose (default), Transpose, or ConjugateTranspose.
+   */
+  template <typename Rhs>
+  PlainMatrix solve(const MatrixBase<Rhs>& B, TransposeMode mode = NoTranspose) const {
+    const_cast<GpuLU*>(this)->sync_info();
+    eigen_assert(info_ == Success && "GpuLU::solve called on a failed or uninitialized factorization");
+    eigen_assert(B.rows() == n_);
+
+    const PlainMatrix rhs(B);
+    const int64_t nrhs = static_cast<int64_t>(rhs.cols());
+    const int64_t ldb = static_cast<int64_t>(rhs.rows());
+    DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpyAsync(d_x_ptr, rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
+    });
+
+    PlainMatrix X(n_, B.cols());
+    int solve_info = 0;
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(X.data(), d_X.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+
+    eigen_assert(solve_info == 0 && "cusolverDnXgetrs reported an error");
+    return X;
+  }
+
+  /** Solve op(A) * X = B with device-resident RHS. Fully async. */
+  DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B, TransposeMode mode = NoTranspose) const {
+    eigen_assert(d_B.rows() == n_);
+    d_B.waitReady(stream_);
+    const int64_t nrhs = static_cast<int64_t>(d_B.cols());
+    const int64_t ldb = static_cast<int64_t>(d_B.rows());
+    return solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpyAsync(d_x_ptr, d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
+    });
+  }
+
+  // ---- Accessors -----------------------------------------------------------
+
+  /** Lazily synchronizes the stream on first call after compute(). */
+  ComputationInfo info() const {
+    const_cast<GpuLU*>(this)->sync_info();
+    return info_;
+  }
+  Index rows() const { return n_; }
+  Index cols() const { return n_; }
+  cudaStream_t stream() const { return stream_; }
+
+ private:
+  cudaStream_t stream_ = nullptr;
+  cusolverDnHandle_t handle_ = nullptr;
+  internal::CusolverParams params_;   // cuSOLVER params (created once, reused)
+  internal::DeviceBuffer d_lu_;       // LU factors on device (grows, never shrinks)
+  size_t lu_alloc_size_ = 0;          // current d_lu_ allocation size
+  internal::DeviceBuffer d_ipiv_;     // pivot indices (int64_t) on device
+  internal::DeviceBuffer d_scratch_;  // combined workspace + info word (grows, never shrinks)
+  size_t scratch_size_ = 0;           // current scratch allocation size
+  std::vector<char> h_workspace_;     // host workspace (kept alive until next compute)
+  Index n_ = 0;
+  int64_t lda_ = 0;
+  ComputationInfo info_ = InvalidInput;
+  int info_word_ = 0;        // host-side target for async info download
+  bool info_synced_ = true;  // has the stream been synced for info?
+
+  bool begin_compute(Index rows) {
+    n_ = rows;
+    info_ = InvalidInput;
+    if (n_ == 0) {
+      info_ = Success;
+      return false;
+    }
+    return true;
+  }
+
+  size_t matrixBytes() const { return matrixBytes(static_cast<int64_t>(n_), lda_); }
+
+  static size_t matrixBytes(int64_t cols, int64_t outer_stride) {
+    return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
+  }
+
+  void allocate_lu_storage() {
+    size_t needed = matrixBytes();
+    if (needed > lu_alloc_size_) {
+      d_lu_ = internal::DeviceBuffer(needed);
+      lu_alloc_size_ = needed;
+    }
+  }
+
+  // Ensure d_scratch_ is at least `workspace_bytes + sizeof(int)`.
+  // Layout: [workspace (workspace_bytes) | info_word (sizeof(int))].
+  // Ensure d_scratch_ can hold workspace_bytes + an aligned info word.
+  // Grows but never shrinks. Syncs the stream before reallocating to
+  // avoid freeing memory that async kernels may still be using.
+  void ensure_scratch(size_t workspace_bytes) {
+    constexpr size_t kAlign = 16;
+    workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
+    size_t needed = workspace_bytes + sizeof(int);
+    if (needed > scratch_size_) {
+      if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      d_scratch_ = internal::DeviceBuffer(needed);
+      scratch_size_ = needed;
+    }
+  }
+
+  void* scratch_workspace() const { return d_scratch_.ptr; }
+  int* scratch_info() const {
+    return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
+  }
+
+  template <typename CopyRhs>
+  DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, TransposeMode mode, CopyRhs&& copy_rhs) const {
+    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
+    const cublasOperation_t trans = to_cublas_op(mode);
+
+    Scalar* d_x_ptr = nullptr;
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_x_ptr), matrixBytes(nrhs, ldb)));
+    copy_rhs(d_x_ptr);
+
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(handle_, params_.p, trans, static_cast<int64_t>(n_), nrhs, dtype, d_lu_.ptr,
+                                          lda_, static_cast<const int64_t*>(d_ipiv_.ptr), dtype, d_x_ptr, ldb,
+                                          scratch_info()));
+
+    DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs));
+    result.recordReady(stream_);
+    return result;
+  }
+
+  void init_context() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
+    ensure_scratch(0);  // allocate at least the info word
+  }
+
+  void sync_info() {
+    if (!info_synced_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      info_ = (info_word_ == 0) ? Success : NumericalIssue;
+      info_synced_ = true;
+    }
+  }
+
+  // Run cusolverDnXgetrf on d_lu_ (already on device). Allocates d_ipiv_.
+  // Enqueues factorization + async info download. Does NOT sync.
+  // Workspaces are stored as members to ensure they outlive the async kernels.
+  void factorize() {
+    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
+    const size_t ipiv_bytes = static_cast<size_t>(n_) * sizeof(int64_t);
+
+    info_synced_ = false;
+    info_ = InvalidInput;
+
+    d_ipiv_ = internal::DeviceBuffer(ipiv_bytes);
+
+    size_t dev_ws_bytes = 0, host_ws_bytes = 0;
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(handle_, params_.p, static_cast<int64_t>(n_),
+                                                     static_cast<int64_t>(n_), dtype, d_lu_.ptr, lda_, dtype,
+                                                     &dev_ws_bytes, &host_ws_bytes));
+
+    ensure_scratch(dev_ws_bytes);
+    h_workspace_.resize(host_ws_bytes);
+
+    EIGEN_CUSOLVER_CHECK(
+        cusolverDnXgetrf(handle_, params_.p, static_cast<int64_t>(n_), static_cast<int64_t>(n_), dtype, d_lu_.ptr, lda_,
+                         static_cast<int64_t*>(d_ipiv_.ptr), dtype, scratch_workspace(), dev_ws_bytes,
+                         host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info()));
+
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
+  }
+
+  static cublasOperation_t to_cublas_op(TransposeMode mode) {
+    switch (mode) {
+      case Transpose:
+        return CUBLAS_OP_T;
+      case ConjugateTranspose:
+        return CUBLAS_OP_C;
+      default:
+        return CUBLAS_OP_N;
+    }
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_LU_H
--- a/Eigen/src/GPU/GpuQR.h
+++ b/Eigen/src/GPU/GpuQR.h
@@ -0,0 +1,389 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// GPU QR decomposition using cuSOLVER.
+//
+// Wraps cusolverDnXgeqrf (factorization), cusolverDnXormqr (apply Q),
+// cusolverDnXorgqr (form Q), and cublasXtrsm (triangular solve on R).
+//
+// The factored matrix (reflectors + R) and tau stay in device memory.
+// Solve uses ormqr + trsm without forming Q explicitly.
+//
+// Usage:
+//   GpuQR<double> qr(A);              // upload A, geqrf
+//   if (qr.info() != Success) { ... }
+//   MatrixXd X = qr.solve(B);         // Q^H * B via ormqr, then trsm on R
+//
+// Expression syntax:
+//   d_X = d_A.qr().solve(d_B);        // temporary, no caching
+
+#ifndef EIGEN_GPU_QR_H
+#define EIGEN_GPU_QR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuSolverSupport.h"
+#include "./CuBlasSupport.h"
+#include <vector>
+
+namespace Eigen {
+
+template <typename Scalar_>
+class GpuQR {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+
+  GpuQR() { init_context(); }
+
+  template <typename InputType>
+  explicit GpuQR(const EigenBase<InputType>& A) {
+    init_context();
+    compute(A);
+  }
+
+  ~GpuQR() {
+    if (handle_) (void)cusolverDnDestroy(handle_);
+    if (cublas_) (void)cublasDestroy(cublas_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  GpuQR(const GpuQR&) = delete;
+  GpuQR& operator=(const GpuQR&) = delete;
+
+  GpuQR(GpuQR&& o) noexcept
+      : stream_(o.stream_),
+        handle_(o.handle_),
+        cublas_(o.cublas_),
+        params_(std::move(o.params_)),
+        d_qr_(std::move(o.d_qr_)),
+        d_tau_(std::move(o.d_tau_)),
+        d_scratch_(std::move(o.d_scratch_)),
+        scratch_size_(o.scratch_size_),
+        h_workspace_(std::move(o.h_workspace_)),
+        m_(o.m_),
+        n_(o.n_),
+        lda_(o.lda_),
+        info_(o.info_),
+        info_word_(o.info_word_),
+        info_synced_(o.info_synced_) {
+    o.stream_ = nullptr;
+    o.handle_ = nullptr;
+    o.cublas_ = nullptr;
+    o.scratch_size_ = 0;
+    o.m_ = 0;
+    o.n_ = 0;
+    o.lda_ = 0;
+    o.info_ = InvalidInput;
+    o.info_word_ = 0;
+    o.info_synced_ = true;
+  }
+
+  GpuQR& operator=(GpuQR&& o) noexcept {
+    if (this != &o) {
+      if (handle_) (void)cusolverDnDestroy(handle_);
+      if (cublas_) (void)cublasDestroy(cublas_);
+      if (stream_) (void)cudaStreamDestroy(stream_);
+      stream_ = o.stream_;
+      handle_ = o.handle_;
+      cublas_ = o.cublas_;
+      params_ = std::move(o.params_);
+      d_qr_ = std::move(o.d_qr_);
+      d_tau_ = std::move(o.d_tau_);
+      d_scratch_ = std::move(o.d_scratch_);
+      scratch_size_ = o.scratch_size_;
+      h_workspace_ = std::move(o.h_workspace_);
+      m_ = o.m_;
+      n_ = o.n_;
+      lda_ = o.lda_;
+      info_ = o.info_;
+      info_word_ = o.info_word_;
+      info_synced_ = o.info_synced_;
+      o.stream_ = nullptr;
+      o.handle_ = nullptr;
+      o.cublas_ = nullptr;
+      o.scratch_size_ = 0;
+      o.m_ = 0;
+      o.n_ = 0;
+      o.lda_ = 0;
+      o.info_ = InvalidInput;
+      o.info_word_ = 0;
+      o.info_synced_ = true;
+    }
+    return *this;
+  }
+
+  // ---- Factorization -------------------------------------------------------
+
+  template <typename InputType>
+  GpuQR& compute(const EigenBase<InputType>& A) {
+    m_ = A.rows();
+    n_ = A.cols();
+    info_ = InvalidInput;
+    info_synced_ = false;
+
+    if (m_ == 0 || n_ == 0) {
+      info_ = Success;
+      info_synced_ = true;
+      return *this;
+    }
+
+    const PlainMatrix mat(A.derived());
+    lda_ = static_cast<int64_t>(mat.rows());
+    const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
+    const size_t tau_bytes = static_cast<size_t>((std::min)(m_, n_)) * sizeof(Scalar);
+
+    d_qr_ = internal::DeviceBuffer(mat_bytes);
+    d_tau_ = internal::DeviceBuffer(tau_bytes);
+
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_qr_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
+
+    factorize();
+    return *this;
+  }
+
+  GpuQR& compute(const DeviceMatrix<Scalar>& d_A) {
+    m_ = d_A.rows();
+    n_ = d_A.cols();
+    info_ = InvalidInput;
+    info_synced_ = false;
+
+    if (m_ == 0 || n_ == 0) {
+      info_ = Success;
+      info_synced_ = true;
+      return *this;
+    }
+
+    lda_ = static_cast<int64_t>(d_A.rows());
+    const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
+    const size_t tau_bytes = static_cast<size_t>((std::min)(m_, n_)) * sizeof(Scalar);
+
+    d_A.waitReady(stream_);
+    d_qr_ = internal::DeviceBuffer(mat_bytes);
+    d_tau_ = internal::DeviceBuffer(tau_bytes);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_qr_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
+
+    factorize();
+    return *this;
+  }
+
+  // ---- Solve ---------------------------------------------------------------
+
+  /** Solve A * X = B via QR: X = R^{-1} * Q^H * B (least-squares for m >= n).
+   * Uses ormqr (apply Q^H) + trsm (solve R), without forming Q explicitly.
+   * Requires m >= n (overdetermined or square). Underdetermined not supported.
+   *
+   * TODO: Add device-side accessor for the R factor (and Q application) as
+   * DeviceMatrix, so users can chain GPU operations without host round-trips. */
+  template <typename Rhs>
+  PlainMatrix solve(const MatrixBase<Rhs>& B) const {
+    sync_info();
+    eigen_assert(info_ == Success && "GpuQR::solve called on a failed or uninitialized factorization");
+    eigen_assert(B.rows() == m_);
+    eigen_assert(m_ >= n_ && "GpuQR::solve requires m >= n (use SVD for underdetermined systems)");
+
+    const PlainMatrix rhs(B);
+    const int64_t nrhs = static_cast<int64_t>(rhs.cols());
+    const int64_t ldb = static_cast<int64_t>(rhs.rows());  // = m_
+    const size_t b_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+
+    // Upload B to device (m × nrhs buffer).
+    internal::DeviceBuffer d_B(b_bytes);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.ptr, rhs.data(), b_bytes, cudaMemcpyHostToDevice, stream_));
+
+    // Apply Q^H to B in-place: d_B becomes m × nrhs, first n rows hold Q^H * B relevant part.
+    apply_QH(d_B.ptr, ldb, nrhs);
+
+    // Solve R * X = (Q^H * B)[0:n,:] via trsm on the first n rows.
+    Scalar alpha(1);
+    EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
+                                             CUBLAS_DIAG_NON_UNIT, static_cast<int>(n_), static_cast<int>(nrhs), &alpha,
+                                             static_cast<const Scalar*>(d_qr_.ptr), static_cast<int>(lda_),
+                                             static_cast<Scalar*>(d_B.ptr), static_cast<int>(ldb)));
+
+    // Download the first n rows of each column (stride = ldb = m, width = n).
+    PlainMatrix X(n_, rhs.cols());
+    if (m_ == n_) {
+      // Square: dense copy, no stride mismatch.
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_B.ptr,
+                                               static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar),
+                                               cudaMemcpyDeviceToHost, stream_));
+    } else {
+      // Overdetermined: 2D copy to extract first n rows from each column.
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(
+          X.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_B.ptr, static_cast<size_t>(ldb) * sizeof(Scalar),
+          static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs), cudaMemcpyDeviceToHost, stream_));
+    }
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return X;
+  }
+
+  /** Solve with device-resident RHS. Returns n × nrhs DeviceMatrix. */
+  DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
+    sync_info();
+    eigen_assert(info_ == Success && "GpuQR::solve called on a failed or uninitialized factorization");
+    eigen_assert(d_B.rows() == m_);
+    eigen_assert(m_ >= n_ && "GpuQR::solve requires m >= n (use SVD for underdetermined systems)");
+    d_B.waitReady(stream_);
+
+    const int64_t nrhs = static_cast<int64_t>(d_B.cols());
+    const int64_t ldb = static_cast<int64_t>(d_B.rows());  // = m_
+    const size_t b_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+
+    // D2D copy B into working buffer (ormqr and trsm are in-place).
+    internal::DeviceBuffer d_work(b_bytes);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_work.ptr, d_B.data(), b_bytes, cudaMemcpyDeviceToDevice, stream_));
+
+    apply_QH(d_work.ptr, ldb, nrhs);
+
+    // trsm on the first n rows.
+    Scalar alpha(1);
+    EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
+                                             CUBLAS_DIAG_NON_UNIT, static_cast<int>(n_), static_cast<int>(nrhs), &alpha,
+                                             static_cast<const Scalar*>(d_qr_.ptr), static_cast<int>(lda_),
+                                             static_cast<Scalar*>(d_work.ptr), static_cast<int>(ldb)));
+
+    if (m_ == n_) {
+      // Square: result is the whole buffer, dense.
+      DeviceMatrix<Scalar> result(static_cast<Scalar*>(d_work.ptr), n_, static_cast<Index>(nrhs));
+      d_work.ptr = nullptr;  // transfer ownership
+      result.recordReady(stream_);
+      return result;
+    } else {
+      // Overdetermined: copy first n rows of each column into a dense n × nrhs result.
+      DeviceMatrix<Scalar> result(n_, static_cast<Index>(nrhs));
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(result.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_work.ptr,
+                                                 static_cast<size_t>(ldb) * sizeof(Scalar),
+                                                 static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs),
+                                                 cudaMemcpyDeviceToDevice, stream_));
+      result.recordReady(stream_);
+      return result;
+      // d_work freed here via RAII — safe because stream is ordered.
+    }
+  }
+
+  // ---- Accessors -----------------------------------------------------------
+
+  ComputationInfo info() const {
+    sync_info();
+    return info_;
+  }
+
+  Index rows() const { return m_; }
+  Index cols() const { return n_; }
+  cudaStream_t stream() const { return stream_; }
+
+ private:
+  cudaStream_t stream_ = nullptr;
+  cusolverDnHandle_t handle_ = nullptr;
+  cublasHandle_t cublas_ = nullptr;
+  internal::CusolverParams params_;
+  internal::DeviceBuffer d_qr_;       // QR factors (reflectors in lower, R in upper)
+  internal::DeviceBuffer d_tau_;      // Householder scalars (min(m,n))
+  internal::DeviceBuffer d_scratch_;  // workspace + info word
+  size_t scratch_size_ = 0;
+  std::vector<char> h_workspace_;
+  Index m_ = 0;
+  Index n_ = 0;
+  int64_t lda_ = 0;
+  ComputationInfo info_ = InvalidInput;
+  int info_word_ = 0;
+  bool info_synced_ = true;
+
+  void init_context() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
+    EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
+    EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
+    ensure_scratch(0);
+  }
+
+  void ensure_scratch(size_t workspace_bytes) {
+    constexpr size_t kAlign = 16;
+    workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
+    size_t needed = workspace_bytes + sizeof(int);
+    if (needed > scratch_size_) {
+      if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      d_scratch_ = internal::DeviceBuffer(needed);
+      scratch_size_ = needed;
+    }
+  }
+
+  void* scratch_workspace() const { return d_scratch_.ptr; }
+  int* scratch_info() const {
+    return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
+  }
+
+  void sync_info() const {
+    if (!info_synced_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      const_cast<GpuQR*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
+      const_cast<GpuQR*>(this)->info_synced_ = true;
+    }
+  }
+
+  void factorize() {
+    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
+
+    info_synced_ = false;
+    info_ = InvalidInput;
+
+    size_t dev_ws = 0, host_ws = 0;
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf_bufferSize(handle_, params_.p, static_cast<int64_t>(m_),
+                                                     static_cast<int64_t>(n_), dtype, d_qr_.ptr, lda_, dtype,
+                                                     d_tau_.ptr, dtype, &dev_ws, &host_ws));
+
+    ensure_scratch(dev_ws);
+    h_workspace_.resize(host_ws);
+
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf(handle_, params_.p, static_cast<int64_t>(m_), static_cast<int64_t>(n_), dtype,
+                                          d_qr_.ptr, lda_, dtype, d_tau_.ptr, dtype, scratch_workspace(), dev_ws,
+                                          host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
+
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
+  }
+
+  // Apply Q^H to a device buffer in-place: d_B = Q^H * d_B.
+  // Uses type-specific ormqr (real) or unmqr (complex) wrappers from CuSolverSupport.h.
+  // For real types: Q^H = Q^T, use CUBLAS_OP_T. For complex: use CUBLAS_OP_C.
+  void apply_QH(void* d_B, int64_t ldb, int64_t nrhs) const {
+    const int im = static_cast<int>(m_);
+    const int in = static_cast<int>(nrhs);
+    const int ik = static_cast<int>((std::min)(m_, n_));
+    const int ilda = static_cast<int>(lda_);
+    const int ildb = static_cast<int>(ldb);
+    constexpr cublasOperation_t trans = NumTraits<Scalar>::IsComplex ? CUBLAS_OP_C : CUBLAS_OP_T;
+
+    int lwork = 0;
+    EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr_bufferSize(
+        handle_, CUBLAS_SIDE_LEFT, trans, im, in, ik, static_cast<const Scalar*>(d_qr_.ptr), ilda,
+        static_cast<const Scalar*>(d_tau_.ptr), static_cast<const Scalar*>(d_B), ildb, &lwork));
+
+    internal::DeviceBuffer d_work(static_cast<size_t>(lwork) * sizeof(Scalar));
+
+    EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr(handle_, CUBLAS_SIDE_LEFT, trans, im, in, ik,
+                                                    static_cast<const Scalar*>(d_qr_.ptr), ilda,
+                                                    static_cast<const Scalar*>(d_tau_.ptr), static_cast<Scalar*>(d_B),
+                                                    ildb, static_cast<Scalar*>(d_work.ptr), lwork, scratch_info()));
+
+    // Sync to ensure workspace can be freed safely, and check ormqr info.
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    int ormqr_info = 0;
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(&ormqr_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost));
+    eigen_assert(ormqr_info == 0 && "cusolverDnXormqr reported an error");
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_QR_H
--- a/Eigen/src/GPU/GpuSVD.h
+++ b/Eigen/src/GPU/GpuSVD.h
@@ -0,0 +1,490 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// GPU SVD decomposition using cuSOLVER (divide-and-conquer).
+//
+// Wraps cusolverDnXgesvd. Stores U, S, VT on device. Solve uses
+// cuBLAS GEMM: X = VT^H * diag(D) * U^H * B.
+//
+// cuSOLVER returns VT (not V). We store and expose VT directly.
+//
+// Usage:
+//   GpuSVD<double> svd(A, ComputeThinU | ComputeThinV);
+//   VectorXd S = svd.singularValues();
+//   MatrixXd U = svd.matrixU();       // m×k or m×m
+//   MatrixXd VT = svd.matrixVT();      // k×n or n×n (this is V^T)
+//   MatrixXd X = svd.solve(B);        // pseudoinverse
+//   MatrixXd X = svd.solve(B, k);     // truncated (top k triplets)
+//   MatrixXd X = svd.solve(B, 0.1);   // Tikhonov regularized
+
+#ifndef EIGEN_GPU_SVD_H
+#define EIGEN_GPU_SVD_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuSolverSupport.h"
+#include "./CuBlasSupport.h"
+#include <vector>
+
+namespace Eigen {
+
+template <typename Scalar_>
+class GpuSVD {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+  using RealVector = Matrix<RealScalar, Dynamic, 1>;
+
+  GpuSVD() { init_context(); }
+
+  template <typename InputType>
+  explicit GpuSVD(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
+    init_context();
+    compute(A, options);
+  }
+
+  ~GpuSVD() {
+    if (handle_) (void)cusolverDnDestroy(handle_);
+    if (cublas_) (void)cublasDestroy(cublas_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  GpuSVD(const GpuSVD&) = delete;
+  GpuSVD& operator=(const GpuSVD&) = delete;
+  // Move constructors omitted for brevity — follow GpuQR pattern.
+
+  // ---- Factorization -------------------------------------------------------
+
+  template <typename InputType>
+  GpuSVD& compute(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
+    options_ = options;
+    m_ = A.rows();
+    n_ = A.cols();
+    info_ = InvalidInput;
+    info_synced_ = false;
+
+    if (m_ == 0 || n_ == 0) {
+      info_ = Success;
+      info_synced_ = true;
+      return *this;
+    }
+
+    // cuSOLVER gesvd requires m >= n. For wide matrices, transpose internally.
+    transposed_ = (m_ < n_);
+    const PlainMatrix mat = transposed_ ? PlainMatrix(A.derived().adjoint()) : PlainMatrix(A.derived());
+    if (transposed_) std::swap(m_, n_);
+
+    lda_ = static_cast<int64_t>(mat.rows());
+    const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
+
+    // Copy (possibly transposed) A to device (gesvd overwrites it).
+    d_A_ = internal::DeviceBuffer(mat_bytes);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
+
+    factorize();
+    return *this;
+  }
+
+  GpuSVD& compute(const DeviceMatrix<Scalar>& d_A, unsigned int options = ComputeThinU | ComputeThinV) {
+    options_ = options;
+    m_ = d_A.rows();
+    n_ = d_A.cols();
+    info_ = InvalidInput;
+    info_synced_ = false;
+
+    if (m_ == 0 || n_ == 0) {
+      info_ = Success;
+      info_synced_ = true;
+      return *this;
+    }
+
+    transposed_ = (m_ < n_);
+    d_A.waitReady(stream_);
+
+    if (transposed_) {
+      // Transpose on device via cuBLAS geam: d_A_ = A^H.
+      std::swap(m_, n_);
+      lda_ = m_;
+      const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
+      d_A_ = internal::DeviceBuffer(mat_bytes);
+      Scalar alpha_one(1), beta_zero(0);
+      // geam: C(m×n) = alpha * op(A) + beta * op(B). Use B = nullptr trick: beta=0.
+      // A is the original d_A (n_orig × m_orig = n × m after swap), transposed → m × n.
+      EIGEN_CUBLAS_CHECK(internal::cublasXgeam(
+          cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(m_), static_cast<int>(n_), &alpha_one, d_A.data(),
+          static_cast<int>(d_A.rows()), &beta_zero, static_cast<const Scalar*>(nullptr), static_cast<int>(m_),
+          static_cast<Scalar*>(d_A_.ptr), static_cast<int>(m_)));
+    } else {
+      lda_ = static_cast<int64_t>(d_A.rows());
+      const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
+      d_A_ = internal::DeviceBuffer(mat_bytes);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
+    }
+
+    factorize();
+    return *this;
+  }
+
+  // ---- Accessors -----------------------------------------------------------
+
+  ComputationInfo info() const {
+    sync_info();
+    return info_;
+  }
+
+  Index rows() const { return transposed_ ? n_ : m_; }
+  Index cols() const { return transposed_ ? m_ : n_; }
+
+  // TODO: Add device-side accessors (deviceU(), deviceVT(), deviceSingularValues())
+  // returning DeviceMatrix views of the internal buffers, so users can chain
+  // GPU operations without round-tripping through host memory.
+
+  /** Singular values (always available). Downloads from device on each call. */
+  RealVector singularValues() const {
+    sync_info();
+    eigen_assert(info_ == Success);
+    const Index k = (std::min)(m_, n_);
+    RealVector S(k);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpy(S.data(), d_S_.ptr, static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
+    return S;
+  }
+
+  /** Left singular vectors U. Returns m_orig × k or m_orig × m_orig.
+   * For transposed case (m_orig < n_orig), U comes from cuSOLVER's VT. */
+  PlainMatrix matrixU() const {
+    sync_info();
+    eigen_assert(info_ == Success);
+    eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "matrixU() requires ComputeThinU or ComputeFullU");
+    const Index m_orig = transposed_ ? n_ : m_;
+    const Index n_orig = transposed_ ? m_ : n_;
+    const Index k = (std::min)(m_orig, n_orig);
+    if (!transposed_) {
+      const Index ucols = (options_ & ComputeFullU) ? m_ : k;
+      PlainMatrix U(m_, ucols);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U.data(), d_U_.ptr,
+                                          static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
+                                          cudaMemcpyDeviceToHost));
+      return U;
+    } else {
+      // Transposed: U_orig = VT_stored^H. VT_stored is vtrows × n_ (= vtrows × m_orig).
+      const Index vtrows = (options_ & ComputeFullU) ? m_orig : k;  // Note: FullU maps to FullV of A^H
+      PlainMatrix VT_stored(vtrows, n_);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT_stored.data(), d_VT_.ptr,
+                                          static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
+                                          cudaMemcpyDeviceToHost));
+      return VT_stored.adjoint();  // m_orig × vtrows
+    }
+  }
+
+  /** Right singular vectors transposed V^T. Returns k × n_orig or n_orig × n_orig.
+   * For transposed case, VT comes from cuSOLVER's U. */
+  PlainMatrix matrixVT() const {
+    sync_info();
+    eigen_assert(info_ == Success);
+    eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "matrixVT() requires ComputeThinV or ComputeFullV");
+    const Index m_orig = transposed_ ? n_ : m_;
+    const Index n_orig = transposed_ ? m_ : n_;
+    const Index k = (std::min)(m_orig, n_orig);
+    if (!transposed_) {
+      const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
+      PlainMatrix VT(vtrows, n_);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT.data(), d_VT_.ptr,
+                                          static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
+                                          cudaMemcpyDeviceToHost));
+      return VT;
+    } else {
+      // Transposed: VT_orig = U_stored^H. U_stored is m_ × ucols (= n_orig × ucols).
+      const Index ucols = (options_ & ComputeFullV) ? n_orig : k;  // FullV maps to FullU of A^H
+      PlainMatrix U_stored(m_, ucols);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U_stored.data(), d_U_.ptr,
+                                          static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
+                                          cudaMemcpyDeviceToHost));
+      return U_stored.adjoint();  // ucols × n_orig
+    }
+  }
+
+  /** Number of singular values above threshold. */
+  Index rank(RealScalar threshold = RealScalar(-1)) const {
+    RealVector S = singularValues();
+    if (S.size() == 0) return 0;
+    if (threshold < 0) {
+      threshold = (std::max)(m_, n_) * S(0) * NumTraits<RealScalar>::epsilon();
+    }
+    Index r = 0;
+    for (Index i = 0; i < S.size(); ++i) {
+      if (S(i) > threshold) ++r;
+    }
+    return r;
+  }
+
+  // ---- Solve ---------------------------------------------------------------
+
+  /** Pseudoinverse solve: X = V * diag(1/S) * U^H * B. */
+  template <typename Rhs>
+  PlainMatrix solve(const MatrixBase<Rhs>& B) const {
+    return solve_impl(B, (std::min)(m_, n_), RealScalar(0));
+  }
+
+  /** Truncated solve: use only top trunc singular triplets. */
+  template <typename Rhs>
+  PlainMatrix solve(const MatrixBase<Rhs>& B, Index trunc) const {
+    eigen_assert(trunc > 0 && trunc <= (std::min)(m_, n_));
+    return solve_impl(B, trunc, RealScalar(0));
+  }
+
+  /** Tikhonov-regularized solve: D_ii = S_i / (S_i^2 + lambda^2). */
+  template <typename Rhs>
+  PlainMatrix solve(const MatrixBase<Rhs>& B, RealScalar lambda) const {
+    eigen_assert(lambda > 0);
+    return solve_impl(B, (std::min)(m_, n_), lambda);
+  }
+
+  cudaStream_t stream() const { return stream_; }
+
+ private:
+  cudaStream_t stream_ = nullptr;
+  cusolverDnHandle_t handle_ = nullptr;
+  cublasHandle_t cublas_ = nullptr;
+  internal::CusolverParams params_;
+  internal::DeviceBuffer d_A_;        // working copy of A (overwritten by gesvd)
+  internal::DeviceBuffer d_U_;        // left singular vectors
+  internal::DeviceBuffer d_S_;        // singular values (RealScalar)
+  internal::DeviceBuffer d_VT_;       // right singular vectors transposed
+  internal::DeviceBuffer d_scratch_;  // workspace + info
+  size_t scratch_size_ = 0;
+  std::vector<char> h_workspace_;
+  unsigned int options_ = 0;
+  Index m_ = 0;
+  Index n_ = 0;
+  int64_t lda_ = 0;
+  bool transposed_ = false;  // true if m < n (we compute SVD of A^T internally)
+  ComputationInfo info_ = InvalidInput;
+  int info_word_ = 0;
+  bool info_synced_ = true;
+
+  void init_context() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
+    EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
+    EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
+    ensure_scratch(0);
+  }
+
+  void ensure_scratch(size_t workspace_bytes) {
+    constexpr size_t kAlign = 16;
+    workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
+    size_t needed = workspace_bytes + sizeof(int);
+    if (needed > scratch_size_) {
+      if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      d_scratch_ = internal::DeviceBuffer(needed);
+      scratch_size_ = needed;
+    }
+  }
+
+  void* scratch_workspace() const { return d_scratch_.ptr; }
+  int* scratch_info() const {
+    return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
+  }
+
+  void sync_info() const {
+    if (!info_synced_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      const_cast<GpuSVD*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
+      const_cast<GpuSVD*>(this)->info_synced_ = true;
+    }
+  }
+
+  // Swap U↔V flags for the transposed case.
+  static unsigned int swap_uv_options(unsigned int opts) {
+    unsigned int result = 0;
+    if (opts & ComputeThinU) result |= ComputeThinV;
+    if (opts & ComputeFullU) result |= ComputeFullV;
+    if (opts & ComputeThinV) result |= ComputeThinU;
+    if (opts & ComputeFullV) result |= ComputeFullU;
+    return result;
+  }
+
+  static signed char jobu(unsigned int opts) {
+    if (opts & ComputeFullU) return 'A';
+    if (opts & ComputeThinU) return 'S';
+    return 'N';
+  }
+
+  static signed char jobvt(unsigned int opts) {
+    if (opts & ComputeFullV) return 'A';
+    if (opts & ComputeThinV) return 'S';
+    return 'N';
+  }
+
+  void factorize() {
+    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
+    constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
+    const Index k = (std::min)(m_, n_);
+
+    info_synced_ = false;
+    info_ = InvalidInput;
+
+    // Allocate output buffers. When transposed, swap U/V roles for cuSOLVER.
+    d_S_ = internal::DeviceBuffer(static_cast<size_t>(k) * sizeof(RealScalar));
+
+    // Internal options: for transposed case, what user wants as U we compute as VT of A^H.
+    const unsigned int int_opts = transposed_ ? swap_uv_options(options_) : options_;
+
+    const Index ucols = (int_opts & ComputeFullU) ? m_ : ((int_opts & ComputeThinU) ? k : 0);
+    const Index vtrows = (int_opts & ComputeFullV) ? n_ : ((int_opts & ComputeThinV) ? k : 0);
+    const int64_t ldu = m_;
+    const int64_t ldvt = vtrows > 0 ? vtrows : 1;
+
+    if (ucols > 0) d_U_ = internal::DeviceBuffer(static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar));
+    if (vtrows > 0)
+      d_VT_ = internal::DeviceBuffer(static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar));
+
+    // computeType must match the matrix data type (dtype), not the singular value type (rtype).
+    eigen_assert(m_ >= n_ && "Internal error: m_ < n_ should have been handled by transpose in compute()");
+    size_t dev_ws = 0, host_ws = 0;
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd_bufferSize(
+        handle_, params_.p, jobu(int_opts), jobvt(int_opts), static_cast<int64_t>(m_), static_cast<int64_t>(n_), dtype,
+        d_A_.ptr, lda_, rtype, d_S_.ptr, dtype, ucols > 0 ? d_U_.ptr : nullptr, ldu, dtype,
+        vtrows > 0 ? d_VT_.ptr : nullptr, ldvt, dtype, &dev_ws, &host_ws));
+
+    ensure_scratch(dev_ws);
+    h_workspace_.resize(host_ws);
+
+    // Compute SVD.
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd(handle_, params_.p, jobu(int_opts), jobvt(int_opts), static_cast<int64_t>(m_),
+                                          static_cast<int64_t>(n_), dtype, d_A_.ptr, lda_, rtype, d_S_.ptr, dtype,
+                                          ucols > 0 ? d_U_.ptr : nullptr, ldu, dtype, vtrows > 0 ? d_VT_.ptr : nullptr,
+                                          ldvt, dtype, scratch_workspace(), dev_ws,
+                                          host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
+
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
+  }
+
+  // Internal solve: X = V * diag(D) * U^H * B, using top `trunc` triplets.
+  // D_ii = 1/S_i (if lambda==0) or S_i/(S_i^2+lambda^2).
+  //
+  // For non-transposed: stored U, VT. X = VT^H * D * U^H * B.
+  // For transposed (SVD of A^H): stored U', VT'. X = U' * D * VT' * B.
+  template <typename Rhs>
+  PlainMatrix solve_impl(const MatrixBase<Rhs>& B, Index trunc, RealScalar lambda) const {
+    sync_info();
+    eigen_assert(info_ == Success && "GpuSVD::solve called on a failed or uninitialized decomposition");
+    eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "solve requires U");
+    eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "solve requires V");
+
+    const Index m_orig = transposed_ ? n_ : m_;
+    const Index n_orig = transposed_ ? m_ : n_;
+    eigen_assert(B.rows() == m_orig);
+
+    const Index k = (std::min)(m_, n_);  // = min(m_orig, n_orig)
+    const Index kk = (std::min)(trunc, k);
+    const Index nrhs = B.cols();
+
+    // Download S to host to build the diagonal scaling.
+    RealVector S(k);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpy(S.data(), d_S_.ptr, static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+
+    // Upload B (m_orig × nrhs).
+    const PlainMatrix rhs(B);
+    internal::DeviceBuffer d_B(static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.ptr, rhs.data(),
+                                             static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
+                                             cudaMemcpyHostToDevice, stream_));
+
+    // Step 1: tmp = U_orig^H * B  (kk × nrhs).
+    // Non-transposed: U_stored is m_×ucols, U_orig = U_stored. Use U_stored^H * B.
+    // Transposed: U_orig = VT_stored^H, so U_orig^H = VT_stored. Use VT_stored * B (no transpose!).
+    internal::DeviceBuffer d_tmp(static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar));
+    {
+      Scalar alpha_one(1), beta_zero(0);
+      constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
+      constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;
+
+      if (!transposed_) {
+        // U_stored^H * B: (m_×kk)^H × (m_×nrhs) → kk×nrhs.
+        EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs),
+                                        static_cast<int>(m_), &alpha_one, d_U_.ptr, dtype, static_cast<int>(m_),
+                                        d_B.ptr, dtype, static_cast<int>(m_orig), &beta_zero, d_tmp.ptr, dtype,
+                                        static_cast<int>(kk), compute, internal::cuda_gemm_algo()));
+      } else {
+        // VT_stored * B: VT_stored is vtrows×n_ = kk×m_orig (thin), NoTrans.
+        // vtrows×m_orig times m_orig×nrhs → vtrows×nrhs. Use first kk rows.
+        const Index vtrows_stored = (swap_uv_options(options_) & ComputeFullV) ? n_ : k;
+        EIGEN_CUBLAS_CHECK(cublasGemmEx(
+            cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs), static_cast<int>(m_orig),
+            &alpha_one, d_VT_.ptr, dtype, static_cast<int>(vtrows_stored), d_B.ptr, dtype, static_cast<int>(m_orig),
+            &beta_zero, d_tmp.ptr, dtype, static_cast<int>(kk), compute, internal::cuda_gemm_algo()));
+      }
+    }
+
+    // Step 2: Scale row i of tmp by D_ii.
+    // Download tmp to host, scale, re-upload. (Simple and correct; a device kernel would be faster.)
+    {
+      PlainMatrix tmp(kk, nrhs);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(tmp.data(), d_tmp.ptr,
+                                               static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar),
+                                               cudaMemcpyDeviceToHost, stream_));
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+
+      for (Index i = 0; i < kk; ++i) {
+        RealScalar si = S(i);
+        RealScalar di = (lambda == RealScalar(0)) ? (si > 0 ? RealScalar(1) / si : RealScalar(0))
+                                                  : si / (si * si + lambda * lambda);
+        tmp.row(i) *= Scalar(di);
+      }
+
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_tmp.ptr, tmp.data(),
+                                               static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar),
+                                               cudaMemcpyHostToDevice, stream_));
+    }
+
+    // Step 3: X = V_orig * tmp  (n_orig × nrhs).
+    // Non-transposed: V_orig = VT_stored^H. VT_stored[:kk,:]^H * tmp → n_orig × nrhs.
+    // Transposed: V_orig = U_stored[:,:kk]. U_stored * tmp → n_orig × nrhs (NoTrans).
+    PlainMatrix X(n_orig, nrhs);
+    {
+      internal::DeviceBuffer d_X(static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
+      Scalar alpha_one(1), beta_zero(0);
+      constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
+      constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;
+
+      if (!transposed_) {
+        const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
+        EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(n_orig),
+                                        static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_VT_.ptr, dtype,
+                                        static_cast<int>(vtrows), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero,
+                                        d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo()));
+      } else {
+        // U_stored is m_×ucols. V_orig = U_stored[:,:kk]. NoTrans × tmp.
+        EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(n_orig),
+                                        static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_U_.ptr, dtype,
+                                        static_cast<int>(m_), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero,
+                                        d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo()));
+      }
+
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_X.ptr,
+                                               static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
+                                               cudaMemcpyDeviceToHost, stream_));
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    }
+
+    return X;
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SVD_H
--- a/Eigen/src/GPU/GpuSparseContext.h
+++ b/Eigen/src/GPU/GpuSparseContext.h
@@ -0,0 +1,321 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix
+// multiply (SpMM) via cuSPARSE.
+//
+// GpuSparseContext manages a cuSPARSE handle and device buffers. It accepts
+// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the
+// GPU. RowMajor input is implicitly converted to ColMajor.
+//
+// Usage:
+//   GpuSparseContext<double> ctx;
+//   VectorXd y = ctx.multiply(A, x);           // y = A * x
+//   ctx.multiply(A, x, y, 2.0, 1.0);           // y = 2*A*x + y
+//   VectorXd z = ctx.multiplyT(A, x);          // z = A^T * x
+//   MatrixXd Y = ctx.multiplyMat(A, X);        // Y = A * X (multiple RHS)
+
+#ifndef EIGEN_GPU_SPARSE_CONTEXT_H
+#define EIGEN_GPU_SPARSE_CONTEXT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuSparseSupport.h"
+
+namespace Eigen {
+
+template <typename Scalar_>
+class GpuSparseContext {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using StorageIndex = int;
+  using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
+  using DenseVector = Matrix<Scalar, Dynamic, 1>;
+  using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+
+  GpuSparseContext() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_));
+    EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_));
+  }
+
+  ~GpuSparseContext() {
+    destroy_descriptors();
+    if (handle_) (void)cusparseDestroy(handle_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  GpuSparseContext(const GpuSparseContext&) = delete;
+  GpuSparseContext& operator=(const GpuSparseContext&) = delete;
+
+  // ---- SpMV: y = A * x -----------------------------------------------------
+
+  /** Compute y = A * x. Returns y as a new dense vector. */
+  template <typename InputType, typename Rhs>
+  DenseVector multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
+    const SpMat mat(A.derived());
+    DenseVector y(mat.rows());
+    y.setZero();
+    multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
+    return y;
+  }
+
+  /** Compute y = alpha * op(A) * x + beta * y (in-place). */
+  template <typename InputType, typename Rhs, typename Dest>
+  void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y,
+                Scalar alpha = Scalar(1), Scalar beta = Scalar(0),
+                cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
+    const SpMat mat(A.derived());
+    multiply_impl(mat, x.derived(), y.derived(), alpha, beta, op);
+  }
+
+  // ---- SpMV transpose: y = A^T * x -----------------------------------------
+
+  /** Compute y = A^T * x. Returns y as a new dense vector. */
+  template <typename InputType, typename Rhs>
+  DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
+    const SpMat mat(A.derived());
+    DenseVector y(mat.cols());
+    y.setZero();
+    multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE);
+    return y;
+  }
+
+  // ---- SpMM: Y = A * X (multiple RHS) --------------------------------------
+
+  /** Compute Y = A * X where X is a dense matrix (multiple RHS). Returns Y. */
+  template <typename InputType, typename Rhs>
+  DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X) {
+    const SpMat mat(A.derived());
+    const DenseMatrix rhs(X.derived());
+    eigen_assert(mat.cols() == rhs.rows());
+
+    const Index m = mat.rows();
+    const Index n = rhs.cols();
+    if (m == 0 || n == 0 || mat.nonZeros() == 0) return DenseMatrix::Zero(m, n);
+
+    DenseMatrix Y = DenseMatrix::Zero(m, n);
+    spmm_impl(mat, rhs, Y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
+    return Y;
+  }
+
+  // ---- Accessors ------------------------------------------------------------
+
+  cudaStream_t stream() const { return stream_; }
+
+ private:
+  cudaStream_t stream_ = nullptr;
+  cusparseHandle_t handle_ = nullptr;
+
+  // Cached device buffers (grow-only).
+  internal::DeviceBuffer d_outerPtr_;
+  internal::DeviceBuffer d_innerIdx_;
+  internal::DeviceBuffer d_values_;
+  internal::DeviceBuffer d_x_;
+  internal::DeviceBuffer d_y_;
+  internal::DeviceBuffer d_workspace_;
+  size_t d_outerPtr_size_ = 0;
+  size_t d_innerIdx_size_ = 0;
+  size_t d_values_size_ = 0;
+  size_t d_x_size_ = 0;
+  size_t d_y_size_ = 0;
+  size_t d_workspace_size_ = 0;
+
+  // Cached cuSPARSE descriptors.
+  cusparseSpMatDescr_t spmat_desc_ = nullptr;
+  Index cached_rows_ = -1;
+  Index cached_cols_ = -1;
+  Index cached_nnz_ = -1;
+
+  // ---- SpMV implementation --------------------------------------------------
+
+  template <typename RhsDerived, typename DestDerived>
+  void multiply_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta,
+                     cusparseOperation_t op) {
+    eigen_assert(A.isCompressed());
+
+    const Index m = A.rows();
+    const Index n = A.cols();
+    const Index nnz = A.nonZeros();
+    const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
+    const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
+
+    eigen_assert(x.size() == x_size);
+    eigen_assert(y.size() == y_size);
+
+    if (m == 0 || n == 0 || nnz == 0) {
+      if (beta == Scalar(0))
+        y.setZero();
+      else
+        y *= beta;
+      return;
+    }
+
+    // Upload sparse matrix to device.
+    upload_sparse(A);
+
+    // Upload x to device.
+    ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar));
+    const DenseVector x_tmp(x);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_x_.ptr, x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
+
+    // Upload y to device (for beta != 0).
+    ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar));
+    if (beta != Scalar(0)) {
+      const DenseVector y_tmp(y);
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpyAsync(d_y_.ptr, y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
+    }
+
+    // Create dense vector descriptors.
+    constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
+    cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr;
+    EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_.ptr, dtype));
+    EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_.ptr, dtype));
+
+    // Query workspace size.
+    size_t ws_size = 0;
+    EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
+                                                 CUSPARSE_SPMV_ALG_DEFAULT, &ws_size));
+    ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
+
+    // Execute SpMV.
+    EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
+                                      CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.ptr));
+
+    // Download result.
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(y.data(), d_y_.ptr, y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+
+    (void)cusparseDestroyDnVec(x_desc);
+    (void)cusparseDestroyDnVec(y_desc);
+  }
+
+  // ---- SpMM implementation --------------------------------------------------
+
+  void spmm_impl(const SpMat& A, const DenseMatrix& X, DenseMatrix& Y, Scalar alpha, Scalar beta,
+                 cusparseOperation_t op) {
+    eigen_assert(A.isCompressed());
+
+    const Index m = A.rows();
+    const Index n = X.cols();
+    const Index k = A.cols();
+    const Index nnz = A.nonZeros();
+
+    if (m == 0 || n == 0 || k == 0 || nnz == 0) {
+      if (beta == Scalar(0))
+        Y.setZero();
+      else
+        Y *= beta;
+      return;
+    }
+
+    upload_sparse(A);
+
+    // Upload X to device.
+    const size_t x_bytes = static_cast<size_t>(k) * static_cast<size_t>(n) * sizeof(Scalar);
+    const size_t y_bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(Scalar);
+    ensure_buffer(d_x_, d_x_size_, x_bytes);
+    ensure_buffer(d_y_, d_y_size_, y_bytes);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_x_.ptr, X.data(), x_bytes, cudaMemcpyHostToDevice, stream_));
+    if (beta != Scalar(0)) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.ptr, Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_));
+    }
+
+    // Create dense matrix descriptors.
+    constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
+    cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr;
+    // Eigen is column-major, so ld = rows.
+    EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k, n, k, d_x_.ptr, dtype, CUSPARSE_ORDER_COL));
+    EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m, n, m, d_y_.ptr, dtype, CUSPARSE_ORDER_COL));
+
+    // Query workspace.
+    size_t ws_size = 0;
+    EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_,
+                                                 x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size));
+    ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
+
+    // Execute SpMM.
+    EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta,
+                                      y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.ptr));
+
+    // Download result.
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.ptr, y_bytes, cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+
+    (void)cusparseDestroyDnMat(x_desc);
+    (void)cusparseDestroyDnMat(y_desc);
+  }
+
+  // ---- Helpers --------------------------------------------------------------
+
+  void upload_sparse(const SpMat& A) {
+    const Index m = A.rows();
+    const Index n = A.cols();
+    const Index nnz = A.nonZeros();
+
+    const size_t outer_bytes = static_cast<size_t>(n + 1) * sizeof(StorageIndex);
+    const size_t inner_bytes = static_cast<size_t>(nnz) * sizeof(StorageIndex);
+    const size_t val_bytes = static_cast<size_t>(nnz) * sizeof(Scalar);
+
+    ensure_buffer(d_outerPtr_, d_outerPtr_size_, outer_bytes);
+    ensure_buffer(d_innerIdx_, d_innerIdx_size_, inner_bytes);
+    ensure_buffer(d_values_, d_values_size_, val_bytes);
+
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_outerPtr_.ptr, A.outerIndexPtr(), outer_bytes, cudaMemcpyHostToDevice, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_innerIdx_.ptr, A.innerIndexPtr(), inner_bytes, cudaMemcpyHostToDevice, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, A.valuePtr(), val_bytes, cudaMemcpyHostToDevice, stream_));
+
+    // Recreate descriptor if shape changed.
+    if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) {
+      destroy_descriptors();
+
+      constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
+      constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value;
+
+      // ColMajor → CSC. outerIndexPtr = col offsets, innerIndexPtr = row indices.
+      EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr,
+                                             idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO, val_type));
+      cached_rows_ = m;
+      cached_cols_ = n;
+      cached_nnz_ = nnz;
+    } else {
+      // Same shape — just update pointers.
+      EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr));
+    }
+  }
+
+  void destroy_descriptors() {
+    if (spmat_desc_) {
+      (void)cusparseDestroySpMat(spmat_desc_);
+      spmat_desc_ = nullptr;
+    }
+    cached_rows_ = -1;
+    cached_cols_ = -1;
+    cached_nnz_ = -1;
+  }
+
+  void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) {
+    if (needed > current_size) {
+      if (buf.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      buf = internal::DeviceBuffer(needed);
+      current_size = needed;
+    }
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SPARSE_CONTEXT_H
--- a/Eigen/src/GPU/GpuSparseLDLT.h
+++ b/Eigen/src/GPU/GpuSparseLDLT.h
@@ -0,0 +1,62 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// GPU sparse LDL^T / LDL^H factorization via cuDSS.
+//
+// For symmetric indefinite (or Hermitian indefinite) sparse matrices.
+// Same three-phase workflow as GpuSparseLLT.
+//
+// Usage:
+//   GpuSparseLDLT<double> ldlt(A);      // analyze + factorize
+//   VectorXd x = ldlt.solve(b);         // solve
+
+#ifndef EIGEN_GPU_SPARSE_LDLT_H
+#define EIGEN_GPU_SPARSE_LDLT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSparseSolverBase.h"
+
+namespace Eigen {
+
+/** GPU sparse LDL^T factorization (symmetric indefinite / Hermitian indefinite).
+ *
+ * Wraps cuDSS with CUDSS_MTYPE_SYMMETRIC (real) or CUDSS_MTYPE_HERMITIAN (complex).
+ * Uses pivoting for numerical stability.
+ *
+ * \tparam Scalar_  float, double, complex<float>, or complex<double>
+ * \tparam UpLo_    Lower (default) or Upper — which triangle of A is stored
+ */
+template <typename Scalar_, int UpLo_ = Lower>
+class GpuSparseLDLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT<Scalar_, UpLo_>> {
+  using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT>;
+  friend Base;
+
+ public:
+  using Scalar = Scalar_;
+  enum { UpLo = UpLo_ };
+
+  GpuSparseLDLT() = default;
+
+  template <typename InputType>
+  explicit GpuSparseLDLT(const SparseMatrixBase<InputType>& A) {
+    this->compute(A);
+  }
+
+  static constexpr bool needs_csr_conversion() { return false; }
+  static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_symmetric_type<Scalar>::value; }
+  static constexpr cudssMatrixViewType_t cudss_matrix_view() {
+    return internal::cudss_view_type<UpLo, ColMajor>::value;
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SPARSE_LDLT_H
--- a/Eigen/src/GPU/GpuSparseLLT.h
+++ b/Eigen/src/GPU/GpuSparseLLT.h
@@ -0,0 +1,62 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// GPU sparse Cholesky (LL^T / LL^H) via cuDSS.
+//
+// Usage:
+//   GpuSparseLLT<double> llt(A);        // analyze + factorize
+//   VectorXd x = llt.solve(b);          // solve
+//   llt.analyzePattern(A);              // or separate phases
+//   llt.factorize(A_new);              // reuse symbolic analysis
+
+#ifndef EIGEN_GPU_SPARSE_LLT_H
+#define EIGEN_GPU_SPARSE_LLT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSparseSolverBase.h"
+
+namespace Eigen {
+
+/** GPU sparse Cholesky factorization (LL^T for real, LL^H for complex).
+ *
+ * Wraps cuDSS with CUDSS_MTYPE_SPD (real) or CUDSS_MTYPE_HPD (complex).
+ * Accepts ColMajor SparseMatrix (CSC), reinterpreted as CSR with swapped
+ * triangle view for zero-copy upload.
+ *
+ * \tparam Scalar_  float, double, complex<float>, or complex<double>
+ * \tparam UpLo_    Lower (default) or Upper — which triangle of A is stored
+ */
+template <typename Scalar_, int UpLo_ = Lower>
+class GpuSparseLLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT<Scalar_, UpLo_>> {
+  using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT>;
+  friend Base;
+
+ public:
+  using Scalar = Scalar_;
+  enum { UpLo = UpLo_ };
+
+  GpuSparseLLT() = default;
+
+  template <typename InputType>
+  explicit GpuSparseLLT(const SparseMatrixBase<InputType>& A) {
+    this->compute(A);
+  }
+
+  static constexpr bool needs_csr_conversion() { return false; }
+  static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_spd_type<Scalar>::value; }
+  static constexpr cudssMatrixViewType_t cudss_matrix_view() {
+    return internal::cudss_view_type<UpLo, ColMajor>::value;
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SPARSE_LLT_H
--- a/Eigen/src/GPU/GpuSparseLU.h
+++ b/Eigen/src/GPU/GpuSparseLU.h
@@ -0,0 +1,59 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// GPU sparse LU factorization via cuDSS.
+//
+// For general (non-symmetric) sparse matrices. Uses pivoting.
+// Same three-phase workflow as GpuSparseLLT.
+//
+// Usage:
+//   GpuSparseLU<double> lu(A);          // analyze + factorize
+//   VectorXd x = lu.solve(b);           // solve
+
+#ifndef EIGEN_GPU_SPARSE_LU_H
+#define EIGEN_GPU_SPARSE_LU_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSparseSolverBase.h"
+
+namespace Eigen {
+
+/** GPU sparse LU factorization (general matrices).
+ *
+ * Wraps cuDSS with CUDSS_MTYPE_GENERAL and CUDSS_MVIEW_FULL.
+ * Accepts ColMajor SparseMatrix (CSC); internally converts to RowMajor
+ * CSR since cuDSS requires CSR input.
+ *
+ * \tparam Scalar_  float, double, complex<float>, or complex<double>
+ */
+template <typename Scalar_>
+class GpuSparseLU : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLU<Scalar_>> {
+  using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLU>;
+  friend Base;
+
+ public:
+  using Scalar = Scalar_;
+
+  GpuSparseLU() = default;
+
+  template <typename InputType>
+  explicit GpuSparseLU(const SparseMatrixBase<InputType>& A) {
+    this->compute(A);
+  }
+
+  static constexpr bool needs_csr_conversion() { return true; }
+  static constexpr cudssMatrixType_t cudss_matrix_type() { return CUDSS_MTYPE_GENERAL; }
+  static constexpr cudssMatrixViewType_t cudss_matrix_view() { return CUDSS_MVIEW_FULL; }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SPARSE_LU_H
--- a/Eigen/src/GPU/GpuSparseSolverBase.h
+++ b/Eigen/src/GPU/GpuSparseSolverBase.h
@@ -0,0 +1,356 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Common base for GPU sparse direct solvers (LLT, LDLT, LU) via cuDSS.
+//
+// All three solver types share the same three-phase workflow
+// (analyzePattern → factorize → solve) and differ only in the
+// cudssMatrixType_t and cudssMatrixViewType_t passed to cuDSS.
+// This CRTP base implements the entire workflow; derived classes
+// provide the matrix type/view via static constexpr members.
+
+#ifndef EIGEN_GPU_SPARSE_SOLVER_BASE_H
+#define EIGEN_GPU_SPARSE_SOLVER_BASE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuDssSupport.h"
+
+namespace Eigen {
+namespace internal {
+
+/** CRTP base for GPU sparse direct solvers.
+ *
+ * \tparam Scalar_  Element type (passed explicitly to avoid incomplete-type issues with CRTP).
+ * \tparam Derived  The concrete solver class (GpuSparseLLT, GpuSparseLDLT, GpuSparseLU).
+ *                  Must provide:
+ *                  - `static constexpr cudssMatrixType_t cudss_matrix_type()`
+ *                  - `static constexpr cudssMatrixViewType_t cudss_matrix_view()`
+ */
+template <typename Scalar_, typename Derived>
+class GpuSparseSolverBase {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using StorageIndex = int;
+  using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
+  using CsrMat = SparseMatrix<Scalar, RowMajor, StorageIndex>;
+  using DenseVector = Matrix<Scalar, Dynamic, 1>;
+  using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+
+  GpuSparseSolverBase() { init_context(); }
+
+  ~GpuSparseSolverBase() {
+    destroy_cudss_objects();
+    if (handle_) (void)cudssDestroy(handle_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  GpuSparseSolverBase(const GpuSparseSolverBase&) = delete;
+  GpuSparseSolverBase& operator=(const GpuSparseSolverBase&) = delete;
+
+  // ---- Configuration --------------------------------------------------------
+
+  /** Set the fill-reducing ordering algorithm. Must be called before compute/analyzePattern. */
+  void setOrdering(GpuSparseOrdering ordering) { ordering_ = ordering; }
+
+  // ---- Factorization --------------------------------------------------------
+
+  /** Symbolic analysis + numeric factorization. */
+  template <typename InputType>
+  Derived& compute(const SparseMatrixBase<InputType>& A) {
+    analyzePattern(A);
+    if (info_ == Success) {
+      factorize(A);
+    }
+    return derived();
+  }
+
+  /** Symbolic analysis only. Uploads sparsity structure to device.
+   * This phase is synchronous (blocks until complete). */
+  template <typename InputType>
+  Derived& analyzePattern(const SparseMatrixBase<InputType>& A) {
+    const SpMat csc(A.derived());
+    eigen_assert(csc.rows() == csc.cols() && "GpuSparseSolver requires a square matrix");
+    eigen_assert(csc.isCompressed() && "GpuSparseSolver requires a compressed sparse matrix");
+
+    n_ = csc.rows();
+    info_ = InvalidInput;
+    analysis_done_ = false;
+
+    if (n_ == 0) {
+      nnz_ = 0;
+      info_ = Success;
+      analysis_done_ = true;
+      return derived();
+    }
+
+    // For symmetric solvers, ColMajor CSC can be reinterpreted as CSR with
+    // swapped triangle view (zero copy). For general solvers, we must convert
+    // to actual RowMajor CSR so cuDSS sees the correct matrix, not A^T.
+    if (Derived::needs_csr_conversion()) {
+      const CsrMat csr(csc);
+      nnz_ = csr.nonZeros();
+      upload_csr(csr);
+    } else {
+      nnz_ = csc.nonZeros();
+      upload_csr_from_csc(csc);
+    }
+    create_cudss_matrix();
+    apply_ordering_config();
+
+    if (data_) EIGEN_CUDSS_CHECK(cudssDataDestroy(handle_, data_));
+    EIGEN_CUDSS_CHECK(cudssDataCreate(handle_, &data_));
+
+    create_placeholder_dense();
+
+    EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_ANALYSIS, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
+
+    analysis_done_ = true;
+    info_ = Success;
+    return derived();
+  }
+
+  /** Numeric factorization using the symbolic analysis from analyzePattern.
+   *
+   * \warning The sparsity pattern (outerIndexPtr, innerIndexPtr) must be
+   * identical to the one passed to analyzePattern(). Only the numerical
+   * values may change. Passing a different pattern is undefined behavior.
+   * This matches the contract of CHOLMOD, UMFPACK, and cuDSS's own API.
+   *
+   * This phase is asynchronous — info() lazily synchronizes. */
+  template <typename InputType>
+  Derived& factorize(const SparseMatrixBase<InputType>& A) {
+    eigen_assert(analysis_done_ && "factorize() requires analyzePattern() first");
+
+    if (n_ == 0) {
+      info_ = Success;
+      return derived();
+    }
+
+    // Convert to the same format used in analyzePattern.
+    // Both temporaries must outlive the async memcpy (pageable H2D is actually
+    // synchronous w.r.t. the host, but keep them alive for clarity).
+    const SpMat csc(A.derived());
+    eigen_assert(csc.rows() == n_ && csc.cols() == n_);
+
+    const Scalar* value_ptr;
+    Index value_nnz;
+    CsrMat csr_tmp;
+    if (Derived::needs_csr_conversion()) {
+      csr_tmp = CsrMat(csc);
+      value_ptr = csr_tmp.valuePtr();
+      value_nnz = csr_tmp.nonZeros();
+    } else {
+      value_ptr = csc.valuePtr();
+      value_nnz = csc.nonZeros();
+    }
+    eigen_assert(value_nnz == nnz_);
+
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, value_ptr, static_cast<size_t>(nnz_) * sizeof(Scalar),
+                                             cudaMemcpyHostToDevice, stream_));
+
+    EIGEN_CUDSS_CHECK(cudssMatrixSetValues(d_A_cudss_, d_values_.ptr));
+
+    info_ = InvalidInput;
+    info_synced_ = false;
+    EIGEN_CUDSS_CHECK(
+        cudssExecute(handle_, CUDSS_PHASE_FACTORIZATION, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
+
+    return derived();
+  }
+
+  // ---- Solve ----------------------------------------------------------------
+
+  /** Solve A * X = B. Returns X as a dense matrix.
+   * Supports single or multiple right-hand sides. */
+  template <typename Rhs>
+  DenseMatrix solve(const MatrixBase<Rhs>& B) const {
+    sync_info();
+    eigen_assert(info_ == Success && "GpuSparseSolver::solve requires a successful factorization");
+    eigen_assert(B.rows() == n_);
+
+    const DenseMatrix rhs(B);
+    const int64_t nrhs = static_cast<int64_t>(rhs.cols());
+
+    if (n_ == 0) return DenseMatrix(0, rhs.cols());
+
+    const size_t rhs_bytes = static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+    DeviceBuffer d_b(rhs_bytes);
+    DeviceBuffer d_x(rhs_bytes);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_b.ptr, rhs.data(), rhs_bytes, cudaMemcpyHostToDevice, stream_));
+
+    constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
+    cudssMatrix_t b_cudss = nullptr, x_cudss = nullptr;
+    EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&b_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_b.ptr,
+                                          dtype, CUDSS_LAYOUT_COL_MAJOR));
+    EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&x_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_x.ptr,
+                                          dtype, CUDSS_LAYOUT_COL_MAJOR));
+
+    EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_SOLVE, config_, data_, d_A_cudss_, x_cudss, b_cudss));
+
+    DenseMatrix X(n_, rhs.cols());
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_x.ptr, rhs_bytes, cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+
+    (void)cudssMatrixDestroy(b_cudss);
+    (void)cudssMatrixDestroy(x_cudss);
+
+    return X;
+  }
+
+  // ---- Accessors ------------------------------------------------------------
+
+  ComputationInfo info() const {
+    sync_info();
+    return info_;
+  }
+  Index rows() const { return n_; }
+  Index cols() const { return n_; }
+
+  cudaStream_t stream() const { return stream_; }
+
+ protected:
+  // ---- CUDA / cuDSS handles -------------------------------------------------
+  cudaStream_t stream_ = nullptr;
+  cudssHandle_t handle_ = nullptr;
+  cudssConfig_t config_ = nullptr;
+  cudssData_t data_ = nullptr;
+  cudssMatrix_t d_A_cudss_ = nullptr;
+  cudssMatrix_t d_x_cudss_ = nullptr;
+  cudssMatrix_t d_b_cudss_ = nullptr;
+
+  // ---- Device buffers for CSR arrays ----------------------------------------
+  DeviceBuffer d_rowPtr_;
+  DeviceBuffer d_colIdx_;
+  DeviceBuffer d_values_;
+
+  // ---- State ----------------------------------------------------------------
+  Index n_ = 0;
+  Index nnz_ = 0;
+  ComputationInfo info_ = InvalidInput;
+  bool info_synced_ = true;
+  bool analysis_done_ = false;
+  GpuSparseOrdering ordering_ = GpuSparseOrdering::AMD;
+
+ private:
+  Derived& derived() { return static_cast<Derived&>(*this); }
+  const Derived& derived() const { return static_cast<const Derived&>(*this); }
+
+  void init_context() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUDSS_CHECK(cudssCreate(&handle_));
+    EIGEN_CUDSS_CHECK(cudssSetStream(handle_, stream_));
+    EIGEN_CUDSS_CHECK(cudssConfigCreate(&config_));
+  }
+
+  void sync_info() const {
+    if (!info_synced_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      int cudss_info = 0;
+      EIGEN_CUDSS_CHECK(cudssDataGet(handle_, data_, CUDSS_DATA_INFO, &cudss_info, sizeof(cudss_info), nullptr));
+      auto* self = const_cast<GpuSparseSolverBase*>(this);
+      self->info_ = (cudss_info == 0) ? Success : NumericalIssue;
+      self->info_synced_ = true;
+    }
+  }
+
+  void destroy_cudss_objects() {
+    if (d_A_cudss_) {
+      (void)cudssMatrixDestroy(d_A_cudss_);
+      d_A_cudss_ = nullptr;
+    }
+    if (d_x_cudss_) {
+      (void)cudssMatrixDestroy(d_x_cudss_);
+      d_x_cudss_ = nullptr;
+    }
+    if (d_b_cudss_) {
+      (void)cudssMatrixDestroy(d_b_cudss_);
+      d_b_cudss_ = nullptr;
+    }
+    if (data_) {
+      (void)cudssDataDestroy(handle_, data_);
+      data_ = nullptr;
+    }
+    if (config_) {
+      (void)cudssConfigDestroy(config_);
+      config_ = nullptr;
+    }
+  }
+
+  // Upload CSR from a RowMajor sparse matrix (native CSR).
+  void upload_csr(const CsrMat& csr) { upload_compressed(csr.outerIndexPtr(), csr.innerIndexPtr(), csr.valuePtr()); }
+
+  // Upload CSC arrays reinterpreted as CSR (for symmetric matrices: CSC(A) = CSR(A^T) = CSR(A)).
+  void upload_csr_from_csc(const SpMat& csc) {
+    upload_compressed(csc.outerIndexPtr(), csc.innerIndexPtr(), csc.valuePtr());
+  }
+
+  void upload_compressed(const StorageIndex* outer, const StorageIndex* inner, const Scalar* values) {
+    const size_t rowptr_bytes = static_cast<size_t>(n_ + 1) * sizeof(StorageIndex);
+    const size_t colidx_bytes = static_cast<size_t>(nnz_) * sizeof(StorageIndex);
+    const size_t values_bytes = static_cast<size_t>(nnz_) * sizeof(Scalar);
+
+    d_rowPtr_ = DeviceBuffer(rowptr_bytes);
+    d_colIdx_ = DeviceBuffer(colidx_bytes);
+    d_values_ = DeviceBuffer(values_bytes);
+
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_rowPtr_.ptr, outer, rowptr_bytes, cudaMemcpyHostToDevice, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_colIdx_.ptr, inner, colidx_bytes, cudaMemcpyHostToDevice, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, values, values_bytes, cudaMemcpyHostToDevice, stream_));
+  }
+
+  void create_cudss_matrix() {
+    if (d_A_cudss_) (void)cudssMatrixDestroy(d_A_cudss_);
+
+    constexpr cudaDataType_t idx_type = cudss_index_type<StorageIndex>::value;
+    constexpr cudaDataType_t val_type = cuda_data_type<Scalar>::value;
+    constexpr cudssMatrixType_t mtype = Derived::cudss_matrix_type();
+    constexpr cudssMatrixViewType_t mview = Derived::cudss_matrix_view();
+
+    EIGEN_CUDSS_CHECK(cudssMatrixCreateCsr(
+        &d_A_cudss_, static_cast<int64_t>(n_), static_cast<int64_t>(n_), static_cast<int64_t>(nnz_), d_rowPtr_.ptr,
+        /*rowEnd=*/nullptr, d_colIdx_.ptr, d_values_.ptr, idx_type, val_type, mtype, mview, CUDSS_BASE_ZERO));
+  }
+
+  void apply_ordering_config() {
+    cudssAlgType_t alg;
+    switch (ordering_) {
+      case GpuSparseOrdering::AMD:
+        alg = CUDSS_ALG_DEFAULT;
+        break;
+      case GpuSparseOrdering::METIS:
+        alg = CUDSS_ALG_2;
+        break;
+      case GpuSparseOrdering::RCM:
+        alg = CUDSS_ALG_3;
+        break;
+      default:
+        alg = CUDSS_ALG_DEFAULT;
+        break;
+    }
+    EIGEN_CUDSS_CHECK(cudssConfigSet(config_, CUDSS_CONFIG_REORDERING_ALG, &alg, sizeof(alg)));
+  }
+
+  void create_placeholder_dense() {
+    if (d_x_cudss_) (void)cudssMatrixDestroy(d_x_cudss_);
+    if (d_b_cudss_) (void)cudssMatrixDestroy(d_b_cudss_);
+    constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
+    EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_x_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
+                                          dtype, CUDSS_LAYOUT_COL_MAJOR));
+    EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_b_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
+                                          dtype, CUDSS_LAYOUT_COL_MAJOR));
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SPARSE_SOLVER_BASE_H
--- a/Eigen/src/GPU/GpuSupport.h
+++ b/Eigen/src/GPU/GpuSupport.h
@@ -0,0 +1,101 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Generic CUDA runtime support shared across all GPU library integrations
+// (cuSOLVER, cuBLAS, cuDSS, etc.):
+//   - Error-checking macros
+//   - RAII device buffer
+//
+// Only depends on <cuda_runtime.h>. No NVIDIA library headers.
+
+#ifndef EIGEN_GPU_SUPPORT_H
+#define EIGEN_GPU_SUPPORT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include <cuda_runtime.h>
+
+namespace Eigen {
+namespace internal {
+
+// ---- Error-checking macros --------------------------------------------------
+// These abort (via eigen_assert) on failure. Not for use in destructors.
+
+#define EIGEN_CUDA_RUNTIME_CHECK(expr)                             \
+  do {                                                             \
+    cudaError_t _e = (expr);                                       \
+    eigen_assert(_e == cudaSuccess && "CUDA runtime call failed"); \
+  } while (0)
+
+// ---- RAII: device buffer ----------------------------------------------------
+
+struct DeviceBuffer {
+  void* ptr = nullptr;
+
+  DeviceBuffer() = default;
+
+  explicit DeviceBuffer(size_t bytes) {
+    if (bytes > 0) EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&ptr, bytes));
+  }
+
+  ~DeviceBuffer() {
+    if (ptr) (void)cudaFree(ptr);  // destructor: ignore errors
+  }
+
+  // Move-only.
+  DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr) { o.ptr = nullptr; }
+  DeviceBuffer& operator=(DeviceBuffer&& o) noexcept {
+    if (this != &o) {
+      if (ptr) (void)cudaFree(ptr);
+      ptr = o.ptr;
+      o.ptr = nullptr;
+    }
+    return *this;
+  }
+
+  DeviceBuffer(const DeviceBuffer&) = delete;
+  DeviceBuffer& operator=(const DeviceBuffer&) = delete;
+
+  // Adopt an existing device pointer. Caller relinquishes ownership.
+  static DeviceBuffer adopt(void* p) {
+    DeviceBuffer b;
+    b.ptr = p;
+    return b;
+  }
+};
+
+// ---- Scalar → cudaDataType_t ------------------------------------------------
+// Shared by cuBLAS and cuSOLVER. cudaDataType_t is defined in library_types.h
+// which is included transitively by cuda_runtime.h.
+
+template <typename Scalar>
+struct cuda_data_type;
+
+template <>
+struct cuda_data_type<float> {
+  static constexpr cudaDataType_t value = CUDA_R_32F;
+};
+template <>
+struct cuda_data_type<double> {
+  static constexpr cudaDataType_t value = CUDA_R_64F;
+};
+template <>
+struct cuda_data_type<std::complex<float>> {
+  static constexpr cudaDataType_t value = CUDA_C_32F;
+};
+template <>
+struct cuda_data_type<std::complex<double>> {
+  static constexpr cudaDataType_t value = CUDA_C_64F;
+};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SUPPORT_H
--- a/Eigen/src/GPU/InternalHeaderCheck.h
+++ b/Eigen/src/GPU/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_GPU_MODULE_H
+#error "Please include Eigen/GPU instead of including headers inside the src/GPU directory directly."
+#endif
--- a/Eigen/src/GPU/README.md
+++ b/Eigen/src/GPU/README.md
@@ -0,0 +1,636 @@
+# Eigen GPU Module (`Eigen/GPU`)
+
+GPU-accelerated linear algebra for Eigen users, dispatching to NVIDIA CUDA
+libraries (cuBLAS, cuSOLVER, cuFFT, cuSPARSE, cuDSS). Requires CUDA 11.4+;
+cuDSS features require CUDA 12.0+ and a separate cuDSS install. Header-only.
+
+## Why this module
+
+Eigen is the linear algebra foundation for a large ecosystem of C++ projects
+in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP,
+Open3D), scientific computing (Ceres, Stan), and beyond. Many of these
+projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations
+without dropping down to raw CUDA library APIs.
+
+GPU sparse solvers are a particularly acute gap. Sparse factorization is the
+bottleneck in SLAM, bundle adjustment, FEM, and nonlinear optimization --
+exactly the workloads where GPU acceleration matters most. Downstream projects
+like [Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
+[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
+GPU-accelerated sparse solvers, and third-party projects like
+[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically because
+Eigen lacks them. The `Eigen/GPU` module provides GPU sparse Cholesky, LDL^T,
+and LU factorization via cuDSS, alongside dense solvers (cuSOLVER), matrix
+products (cuBLAS), FFT (cuFFT), and sparse matrix-vector products (cuSPARSE).
+
+Existing Eigen users should be able to move performance-critical dense or
+sparse linear algebra to the GPU with minimal code changes and without
+learning CUDA library APIs directly.
+
+## Design philosophy
+
+**CPU and GPU coexist.** There is no global compile-time switch that replaces
+CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers
+explicitly -- `GpuLLT<double>` vs `LLT<MatrixXd>`, `GpuSparseLLT<double>` vs
+`SimplicialLLT<SparseMatrix<double>>` -- and both coexist in the same binary.
+This also lets users keep the factored matrix on device across multiple solves,
+something impossible with compile-time replacement.
+
+**Familiar syntax.** GPU operations use the same expression patterns as CPU
+Eigen. Here is a side-by-side comparison:
+
+```cpp
+// ---- CPU (Eigen) ----               // ---- GPU (Eigen/GPU) ----
+#include <Eigen/Dense>                  #define EIGEN_USE_GPU
+                                        #include <Eigen/GPU>
+
+// Dense
+MatrixXd A = ...;                       auto d_A = DeviceMatrix<double>::fromHost(A);
+MatrixXd B = ...;                       auto d_B = DeviceMatrix<double>::fromHost(B);
+
+MatrixXd C = A * B;                     DeviceMatrix<double> d_C = d_A * d_B;
+MatrixXd X = A.llt().solve(B);          DeviceMatrix<double> d_X = d_A.llt().solve(d_B);
+
+                                        MatrixXd X = d_X.toHost();
+
+// Sparse (using SpMat = SparseMatrix<double>)
+SimplicialLLT<SpMat> llt(A);            GpuSparseLLT<double> llt(A);
+VectorXd x = llt.solve(b);              VectorXd x = llt.solve(b);
+```
+
+The GPU version reads like CPU Eigen with explicit upload/download for dense
+operations, and an almost identical API for sparse solvers. Unsupported
+expressions are compile errors.
+
+**Explicit over implicit.** Host-device transfers, stream management, and
+library handle lifetimes are visible in the API. There are no hidden
+allocations or synchronizations except where documented (e.g., `toHost()` must
+synchronize to deliver data to the host).
+
+## Key concepts
+
+### `DeviceMatrix<Scalar>`
+
+A typed RAII wrapper for a dense column-major matrix in GPU device memory.
+This is the GPU counterpart of Eigen's `MatrixX<Scalar>`. A vector is simply
+a `DeviceMatrix` with one column.
+
+```cpp
+// Upload from host
+auto d_A = DeviceMatrix<double>::fromHost(A);
+
+// Allocate uninitialized
+DeviceMatrix<double> d_C(m, n);
+
+// Download to host
+MatrixXd C = d_C.toHost();
+
+// Async download (returns a future)
+auto transfer = d_C.toHostAsync();
+// ... do other work ...
+MatrixXd C = transfer.get();
+```
+
+`DeviceMatrix` supports expression methods that mirror Eigen's API:
+`adjoint()`, `transpose()`, `triangularView<UpLo>()`,
+`selfadjointView<UpLo>()`, `llt()`, `lu()`. These return lightweight
+expression objects that are evaluated when assigned.
+
+### `GpuContext`
+
+Every GPU operation needs a CUDA stream and library handles (cuBLAS,
+cuSOLVER). `GpuContext` bundles these together.
+
+For simple usage, you don't need to create one -- a per-thread default context
+is created lazily on first use:
+
+```cpp
+// These use the thread-local default context automatically
+d_C = d_A * d_B;
+d_X = d_A.llt().solve(d_B);
+```
+
+For concurrent multi-stream execution, create explicit contexts:
+
+```cpp
+GpuContext ctx1, ctx2;
+d_C1.device(ctx1) = d_A1 * d_B1;   // runs on stream 1
+d_C2.device(ctx2) = d_A2 * d_B2;   // runs on stream 2 (concurrently)
+```
+
+## Usage
+
+### Matrix operations (cuBLAS)
+
+```cpp
+auto d_A = DeviceMatrix<double>::fromHost(A);
+auto d_B = DeviceMatrix<double>::fromHost(B);
+
+// GEMM: C = A * B, C = A^H * B, C = A * B^T, ...
+DeviceMatrix<double> d_C = d_A * d_B;
+d_C = d_A.adjoint() * d_B;
+d_C = d_A * d_B.transpose();
+
+// Scaled and accumulated
+d_C += 2.0 * d_A * d_B;             // alpha=2, beta=1
+d_C.device(ctx) -= d_A * d_B;       // alpha=-1, beta=1 (requires explicit context)
+
+// Triangular solve (TRSM)
+d_X = d_A.triangularView<Lower>().solve(d_B);
+
+// Symmetric/Hermitian multiply (SYMM/HEMM)
+d_C = d_A.selfadjointView<Lower>() * d_B;
+
+// Rank-k update (SYRK/HERK)
+d_C.selfadjointView<Lower>().rankUpdate(d_A);  // C += A * A^H
+```
+
+### Dense solvers (cuSOLVER)
+
+**One-shot expression syntax** -- Convenient, re-factorizes each time:
+
+```cpp
+// Cholesky solve (potrf + potrs)
+d_X = d_A.llt().solve(d_B);
+
+// LU solve (getrf + getrs)
+d_Y = d_A.lu().solve(d_B);
+```
+
+**Cached factorization** -- Factor once, solve many times:
+
+```cpp
+GpuLLT<double> llt;
+llt.compute(d_A);                    // factorize (async)
+if (llt.info() != Success) { ... }   // lazy sync on first info() call
+auto d_X1 = llt.solve(d_B1);        // reuses factor (async)
+auto d_X2 = llt.solve(d_B2);        // reuses factor (async)
+MatrixXd X2 = d_X2.toHost();
+
+// LU with transpose solve
+GpuLU<double> lu;
+lu.compute(d_A);
+auto d_Y = lu.solve(d_B, GpuLU<double>::Transpose);  // A^T Y = B
+
+// QR solve (overdetermined least squares)
+GpuQR<double> qr;
+qr.compute(d_A);                     // factorize on device (async)
+auto d_X = qr.solve(d_B);           // Q^H * B via ormqr, then trsm on R
+MatrixXd X = d_X.toHost();
+
+// SVD (results downloaded on access)
+GpuSVD<double> svd;
+svd.compute(d_A, ComputeThinU | ComputeThinV);
+VectorXd S = svd.singularValues();   // downloads to host
+MatrixXd U = svd.matrixU();          // downloads to host
+MatrixXd VT = svd.matrixVT();        // V^T (matches cuSOLVER)
+
+// Self-adjoint eigenvalue decomposition (results downloaded on access)
+GpuSelfAdjointEigenSolver<double> es;
+es.compute(d_A);
+VectorXd eigenvals = es.eigenvalues();    // downloads to host
+MatrixXd eigenvecs = es.eigenvectors();   // downloads to host
+```
+
+The cached API keeps the factored matrix on device, avoiding redundant
+host-device transfers and re-factorizations. All solvers also accept host
+matrices directly as a convenience (e.g., `GpuLLT<double> llt(A)` or
+`qr.solve(B)`), which handles upload/download internally.
+
+### Sparse direct solvers (cuDSS)
+
+Requires cuDSS (separate install, CUDA 12.0+). Define `EIGEN_CUDSS` before
+including `Eigen/GPU` and link with `-lcudss`.
+
+```cpp
+SparseMatrix<double> A = ...;  // symmetric positive definite
+VectorXd b = ...;
+
+// Sparse Cholesky -- one-liner
+GpuSparseLLT<double> llt(A);
+VectorXd x = llt.solve(b);
+
+// Three-phase workflow for repeated solves with the same sparsity pattern
+GpuSparseLLT<double> llt;
+llt.analyzePattern(A);               // symbolic analysis (once)
+llt.factorize(A);                    // numeric factorization
+VectorXd x = llt.solve(b);
+llt.factorize(A_new_values);         // refactorize (reuses symbolic analysis)
+VectorXd x2 = llt.solve(b);
+
+// Sparse LDL^T (symmetric indefinite)
+GpuSparseLDLT<double> ldlt(A);
+VectorXd x = ldlt.solve(b);
+
+// Sparse LU (general non-symmetric)
+GpuSparseLU<double> lu(A);
+VectorXd x = lu.solve(b);
+```
+
+### FFT (cuFFT)
+
+```cpp
+GpuFFT<float> fft;
+
+// 1D complex-to-complex
+VectorXcf X = fft.fwd(x);           // forward
+VectorXcf y = fft.inv(X);           // inverse (scaled by 1/n)
+
+// 1D real-to-complex / complex-to-real
+VectorXcf R = fft.fwd(r);           // returns n/2+1 complex (half-spectrum)
+VectorXf  s = fft.invReal(R, n);    // C2R inverse, caller specifies n
+
+// 2D complex-to-complex
+MatrixXcf B = fft.fwd2d(A);         // 2D forward
+MatrixXcf C = fft.inv2d(B);         // 2D inverse (scaled by 1/(rows*cols))
+
+// Plans are cached and reused across calls with the same size/type.
+```
+
+### Sparse matrix-vector multiply (cuSPARSE)
+
+```cpp
+SparseMatrix<double> A = ...;
+VectorXd x = ...;
+
+GpuSparseContext<double> ctx;
+VectorXd y = ctx.multiply(A, x);            // y = A * x
+VectorXd z = ctx.multiplyT(A, x);           // z = A^T * x
+ctx.multiply(A, x, y, 2.0, 1.0);            // y = 2*A*x + y
+
+// Multiple RHS (SpMM)
+MatrixXd Y = ctx.multiplyMat(A, X);         // Y = A * X
+```
+
+### Precision control
+
+GEMM dispatch enables tensor core algorithms by default, allowing cuBLAS to
+choose the fastest algorithm for the given precision and architecture. For
+double precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64
+results computed faster via tensor cores.
+
+| Macro | Effect |
+|---|---|
+| *(default)* | Tensor core algorithms enabled. Float uses full FP32. Double may use Ozaki on sm_80+. |
+| `EIGEN_CUDA_TF32` | Opt-in: Float uses TF32 (~2x faster, 10-bit mantissa). Double unaffected. |
+| `EIGEN_NO_CUDA_TENSOR_OPS` | Opt-out: Pedantic compute types, no tensor cores. For bit-exact reproducibility. |
+
+### Stream control and async execution
+
+Operations are asynchronous by default. The compute-solve chain runs without
+host synchronization until you need a result on the host:
+
+```
+fromHost(A) --sync-->  compute() --async-->  solve() --async-->  toHost()
+   H2D                  potrf                 potrs                D2H
+                                                                   sync
+```
+
+Mandatory sync points:
+- `fromHost()` -- Synchronizes to complete the upload before returning
+- `toHost()` / `HostTransfer::get()` -- Must deliver data to host
+- `info()` -- Must read the factorization status
+
+**Cross-stream safety** is automatic. `DeviceMatrix` tracks write completion
+via CUDA events. When a matrix written on stream A is read on stream B, the
+module automatically inserts `cudaStreamWaitEvent`. Same-stream operations
+skip the wait (CUDA guarantees in-order execution within a stream).
+
+## Reference
+
+### Supported scalar types
+
+`float`, `double`, `std::complex<float>`, `std::complex<double>` (unless
+noted otherwise).
+
+### Expression -> library call mapping
+
+| DeviceMatrix expression | Library call | Parameters |
+|---|---|---|
+| `C = A * B` | `cublasGemmEx` | transA=N, transB=N, alpha=1, beta=0 |
+| `C = A.adjoint() * B` | `cublasGemmEx` | transA=C, transB=N |
+| `C = A.transpose() * B` | `cublasGemmEx` | transA=T, transB=N |
+| `C = A * B.adjoint()` | `cublasGemmEx` | transA=N, transB=C |
+| `C = A * B.transpose()` | `cublasGemmEx` | transA=N, transB=T |
+| `C = alpha * A * B` | `cublasGemmEx` | alpha from LHS |
+| `C = A * (alpha * B)` | `cublasGemmEx` | alpha from RHS |
+| `C += A * B` | `cublasGemmEx` | alpha=1, beta=1 |
+| `C.device(ctx) -= A * B` | `cublasGemmEx` | alpha=-1, beta=1 |
+| `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs |
+| `X = A.llt<Upper>().solve(B)` | same | uplo=Upper |
+| `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs |
+| `X = A.triangularView<L>().solve(B)` | `cublasXtrsm` | side=L, uplo, diag=NonUnit |
+| `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
+| `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
+
+### `DeviceMatrix<Scalar>`
+
+Typed RAII wrapper for a dense column-major matrix in GPU device memory.
+Always dense (leading dimension = rows). A vector is a `DeviceMatrix` with
+one column.
+
+```cpp
+// Construction
+DeviceMatrix<Scalar>()                                   // Empty (0x0)
+DeviceMatrix<Scalar>(rows, cols)                         // Allocate uninitialized
+
+// Upload / download
+static DeviceMatrix fromHost(matrix, stream=nullptr)           // -> DeviceMatrix (syncs)
+static DeviceMatrix fromHostAsync(ptr, rows, cols, outerStride, s)  // -> DeviceMatrix (no sync, caller manages ptr lifetime)
+PlainMatrix        toHost(stream=nullptr)                      // -> host Matrix (syncs)
+HostTransfer       toHostAsync(stream=nullptr)                 // -> HostTransfer future (no sync)
+DeviceMatrix       clone(stream=nullptr)                       // -> DeviceMatrix (D2D copy, async)
+
+// Dimensions and access
+Index   rows()
+Index   cols()
+size_t  sizeInBytes()
+bool    empty()
+Scalar* data()                                           // Raw device pointer
+void    resize(Index rows, Index cols)                   // Discard contents, reallocate
+
+// Expression builders (return lightweight views, evaluated on assignment)
+AdjointView       adjoint()                              // GEMM with ConjTrans
+TransposeView     transpose()                            // GEMM with Trans
+LltExpr            llt() / llt<UpLo>()                   // -> .solve(d_B) -> DeviceMatrix
+LuExpr             lu()                                  // -> .solve(d_B) -> DeviceMatrix
+TriangularView     triangularView<UpLo>()                // -> .solve(d_B) -> DeviceMatrix (TRSM)
+SelfAdjointView    selfadjointView<UpLo>()               // -> * d_B (SYMM), .rankUpdate(d_A) (SYRK)
+DeviceAssignment   device(GpuContext& ctx)                // Bind assignment to explicit stream
+```
+
+### `GpuContext`
+
+Unified GPU execution context owning a CUDA stream and library handles.
+
+```cpp
+GpuContext()                                             // Creates dedicated stream + handles
+static GpuContext& threadLocal()                         // Per-thread default (lazy-created)
+
+cudaStream_t       stream()
+cublasHandle_t     cublasHandle()
+cusolverDnHandle_t cusolverHandle()
+```
+
+Non-copyable, non-movable (owns library handles).
+
+### `GpuLLT<Scalar, UpLo>` -- Dense Cholesky (cuSOLVER)
+
+Caches the Cholesky factor on device for repeated solves.
+
+```cpp
+GpuLLT()                                                // Default construct, then call compute()
+GpuLLT(const EigenBase<D>& A)                           // Convenience: upload + factorize
+
+GpuLLT&            compute(const EigenBase<D>& A)       // Upload + factorize
+GpuLLT&            compute(const DeviceMatrix& d_A)     // D2D copy + factorize
+GpuLLT&            compute(DeviceMatrix&& d_A)          // Adopt + factorize (no copy)
+
+PlainMatrix        solve(const MatrixBase<D>& B)         // -> host Matrix (syncs)
+DeviceMatrix       solve(const DeviceMatrix& d_B)        // -> DeviceMatrix (async, stays on device)
+
+ComputationInfo    info()                                // Lazy sync on first call: Success or NumericalIssue
+Index              rows() / cols()
+cudaStream_t       stream()
+```
+
+### `GpuLU<Scalar>` -- Dense LU (cuSOLVER)
+
+Same pattern as `GpuLLT`. Adds `TransposeMode` parameter on `solve()`.
+
+```cpp
+PlainMatrix        solve(const MatrixBase<D>& B, TransposeMode m = NoTranspose)  // -> host Matrix
+DeviceMatrix       solve(const DeviceMatrix& d_B, TransposeMode m = NoTranspose) // -> DeviceMatrix
+```
+
+`TransposeMode`: `NoTranspose`, `Transpose`, `ConjugateTranspose`.
+
+### `GpuQR<Scalar>` -- Dense QR (cuSOLVER)
+
+QR factorization via `cusolverDnXgeqrf`. Solve uses ORMQR (apply Q^H) + TRSM
+(back-substitute on R) -- Q is never formed explicitly.
+
+```cpp
+GpuQR()                                                  // Default construct
+GpuQR(const EigenBase<D>& A)                             // Convenience: upload + factorize
+
+GpuQR&             compute(const EigenBase<D>& A)        // Upload + factorize
+GpuQR&             compute(const DeviceMatrix& d_A)      // D2D copy + factorize
+
+PlainMatrix        solve(const MatrixBase<D>& B)         // -> host Matrix (syncs)
+DeviceMatrix       solve(const DeviceMatrix& d_B)        // -> DeviceMatrix (async)
+
+ComputationInfo    info()                                // Lazy sync
+Index              rows() / cols()
+cudaStream_t       stream()
+```
+
+### `GpuSVD<Scalar>` -- Dense SVD (cuSOLVER)
+
+SVD via `cusolverDnXgesvd`. Supports `ComputeThinU | ComputeThinV`,
+`ComputeFullU | ComputeFullV`, or `0` (values only). Wide matrices (m < n)
+handled by internal transpose.
+
+```cpp
+GpuSVD()                                                 // Default construct, then call compute()
+GpuSVD(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV)  // Convenience
+
+GpuSVD&            compute(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV)
+GpuSVD&            compute(const DeviceMatrix& d_A, unsigned options = ComputeThinU | ComputeThinV)
+
+RealVector         singularValues()                      // -> host vector (syncs, downloads)
+PlainMatrix        matrixU()                             // -> host Matrix (syncs, downloads)
+PlainMatrix        matrixVT()                            // -> host Matrix (syncs, downloads V^T)
+
+PlainMatrix        solve(const MatrixBase<D>& B)         // -> host Matrix (pseudoinverse)
+PlainMatrix        solve(const MatrixBase<D>& B, Index k)       // Truncated (top k triplets)
+PlainMatrix        solve(const MatrixBase<D>& B, RealScalar l)  // Tikhonov regularized
+
+Index              rank(RealScalar threshold = -1)
+ComputationInfo    info()                                // Lazy sync
+Index              rows() / cols()
+cudaStream_t       stream()
+```
+
+**Note:** `singularValues()`, `matrixU()`, and `matrixVT()` download to host
+on each call. Device-side accessors returning `DeviceMatrix` are planned but
+not yet implemented.
+
+### `GpuSelfAdjointEigenSolver<Scalar>` -- Eigendecomposition (cuSOLVER)
+
+Symmetric/Hermitian eigenvalue decomposition via `cusolverDnXsyevd`.
+`ComputeMode` enum: `EigenvaluesOnly`, `ComputeEigenvectors`.
+
+```cpp
+GpuSelfAdjointEigenSolver()                              // Default construct, then call compute()
+GpuSelfAdjointEigenSolver(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors)  // Convenience
+
+GpuSelfAdjointEigenSolver& compute(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors)
+GpuSelfAdjointEigenSolver& compute(const DeviceMatrix& d_A, ComputeMode mode = ComputeEigenvectors)
+
+RealVector         eigenvalues()                         // -> host vector (syncs, downloads, ascending order)
+PlainMatrix        eigenvectors()                        // -> host Matrix (syncs, downloads, columns)
+
+ComputationInfo    info()                                // Lazy sync
+Index              rows() / cols()
+cudaStream_t       stream()
+```
+
+**Note:** `eigenvalues()` and `eigenvectors()` download to host on each call.
+Device-side accessors returning `DeviceMatrix` are planned but not yet
+implemented.
+
+### `HostTransfer<Scalar>`
+
+Future for async device-to-host transfer. Returned by
+`DeviceMatrix::toHostAsync()`.
+
+```cpp
+PlainMatrix&       get()                                 // Block until complete, return host Matrix ref. Idempotent.
+bool               ready()                               // Non-blocking poll
+```
+
+### `GpuSparseLLT<Scalar, UpLo>` -- Sparse Cholesky (cuDSS)
+
+Requires cuDSS (CUDA 12.0+, `#define EIGEN_CUDSS`). Three-phase workflow
+with symbolic reuse. Accepts `SparseMatrix<Scalar, ColMajor, int>` (CSC).
+
+```cpp
+GpuSparseLLT()                                           // Default construct
+GpuSparseLLT(const SparseMatrixBase<D>& A)               // Analyze + factorize
+
+GpuSparseLLT&      analyzePattern(const SparseMatrixBase<D>& A)  // Symbolic analysis (reusable)
+GpuSparseLLT&      factorize(const SparseMatrixBase<D>& A)       // Numeric factorization
+GpuSparseLLT&      compute(const SparseMatrixBase<D>& A)         // analyzePattern + factorize
+void               setOrdering(GpuSparseOrdering ord)             // AMD (default), METIS, or RCM
+
+DenseMatrix        solve(const MatrixBase<D>& B)         // -> host Matrix (syncs)
+
+ComputationInfo    info()                                // Lazy sync
+Index              rows() / cols()
+cudaStream_t       stream()
+```
+
+### `GpuSparseLDLT<Scalar, UpLo>` -- Sparse LDL^T (cuDSS)
+
+Symmetric indefinite. Same API as `GpuSparseLLT`.
+
+### `GpuSparseLU<Scalar>` -- Sparse LU (cuDSS)
+
+General non-symmetric. Same API as `GpuSparseLLT` (without `UpLo`).
+
+### `GpuFFT<Scalar>` -- FFT (cuFFT)
+
+Plans cached by (size, type) and reused. Inverse transforms scaled so
+`inv(fwd(x)) == x`. Supported scalars: `float`, `double`.
+
+```cpp
+// 1D transforms (host vectors in and out)
+ComplexVector      fwd(const MatrixBase<D>& x)           // C2C forward (complex input)
+ComplexVector      fwd(const MatrixBase<D>& x)           // R2C forward (real input, returns n/2+1)
+ComplexVector      inv(const MatrixBase<D>& X)           // C2C inverse, scaled by 1/n
+RealVector         invReal(const MatrixBase<D>& X, Index n)  // C2R inverse, scaled by 1/n
+
+// 2D transforms (host matrices in and out)
+ComplexMatrix      fwd2d(const MatrixBase<D>& A)         // 2D C2C forward
+ComplexMatrix      inv2d(const MatrixBase<D>& A)         // 2D C2C inverse, scaled by 1/(rows*cols)
+
+cudaStream_t       stream()
+```
+
+All FFT methods accept host data and return host data. Upload/download is
+handled internally. The C2C and R2C overloads of `fwd()` are distinguished by
+the input scalar type (complex vs real).
+
+### `GpuSparseContext<Scalar>` -- SpMV/SpMM (cuSPARSE)
+
+Accepts `SparseMatrix<Scalar, ColMajor>`. All methods accept host data and
+return host data.
+
+```cpp
+GpuSparseContext()                                       // Creates own stream + cuSPARSE handle
+
+DenseVector        multiply(A, x)                                       // y = A * x
+void               multiply(A, x, y, alpha=1, beta=0,                   // y = alpha*op(A)*x + beta*y
+                     op=CUSPARSE_OPERATION_NON_TRANSPOSE)
+DenseVector        multiplyT(A, x)                                      // y = A^T * x
+DenseMatrix        multiplyMat(A, X)                                    // Y = A * X (SpMM)
+
+cudaStream_t       stream()
+```
+
+### Aliasing
+
+Unlike Eigen's `Matrix`, where omitting `.noalias()` triggers a copy to a
+temporary, DeviceMatrix dispatches directly to NVIDIA library calls which have
+no built-in aliasing protection. All operations are implicitly noalias.
+The caller must ensure operands don't alias the destination for GEMM and TRSM
+(debug asserts catch violations).
+
+## File layout
+
+| File | Depends on | Contents |
+|------|-----------|----------|
+| `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `cuda_data_type<>` |
+| `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` |
+| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM expression wrappers |
+| `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers |
+| `DeviceSolverExpr.h` | `DeviceMatrix.h` | Solver expression wrappers (LLT, LU) |
+| `DeviceDispatch.h` | all above | All dispatch functions + `DeviceAssignment` |
+| `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `GpuContext` |
+| `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>` | cuBLAS error macro, op/compute type maps |
+| `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
+| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
+| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
+| `GpuQR.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense QR decomposition |
+| `GpuSVD.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense SVD decomposition |
+| `GpuEigenSolver.h` | `CuSolverSupport.h` | Self-adjoint eigenvalue decomposition |
+| `CuFftSupport.h` | `GpuSupport.h`, `<cufft.h>` | cuFFT error macro, type-dispatch wrappers |
+| `GpuFFT.h` | `CuFftSupport.h`, `CuBlasSupport.h` | 1D/2D FFT with plan caching |
+| `CuSparseSupport.h` | `GpuSupport.h`, `<cusparse.h>` | cuSPARSE error macro |
+| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE |
+| `CuDssSupport.h` | `GpuSupport.h`, `<cudss.h>` | cuDSS error macro, type traits (optional) |
+| `GpuSparseSolverBase.h` | `CuDssSupport.h` | CRTP base for sparse solvers (optional) |
+| `GpuSparseLLT.h` | `GpuSparseSolverBase.h` | Sparse Cholesky via cuDSS (optional) |
+| `GpuSparseLDLT.h` | `GpuSparseSolverBase.h` | Sparse LDL^T via cuDSS (optional) |
+| `GpuSparseLU.h` | `GpuSparseSolverBase.h` | Sparse LU via cuDSS (optional) |
+
+## Building and testing
+
+```bash
+cmake -G Ninja -B build -S . \
+  -DEIGEN_TEST_CUDA=ON \
+  -DEIGEN_CUDA_COMPUTE_ARCH="70" \
+  -DEIGEN_TEST_CUBLAS=ON \
+  -DEIGEN_TEST_CUSOLVER=ON
+
+cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu \
+  gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen \
+  gpu_device_matrix gpu_cufft gpu_cusparse_spmv
+ctest --test-dir build -R "gpu_" --output-on-failure
+
+# Sparse solvers (cuDSS -- separate install required)
+cmake -G Ninja -B build -S . \
+  -DEIGEN_TEST_CUDA=ON \
+  -DEIGEN_CUDA_COMPUTE_ARCH="70" \
+  -DEIGEN_TEST_CUDSS=ON
+
+cmake --build build --target gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu
+ctest --test-dir build -R gpu_cudss --output-on-failure
+```
+
+## Future work
+
+- **Device-side accessors for decomposition results.** `GpuSVD`,
+  `GpuSelfAdjointEigenSolver`, and `GpuQR` currently download decomposition
+  results to host on access (e.g., `svd.matrixU()` returns a host `MatrixXd`).
+  Device-side accessors returning `DeviceMatrix` views of the internal buffers
+  would allow chaining GPU operations (e.g., `svd.deviceU() * d_A`) without
+  round-tripping through host memory.
+- **Device-resident sparse matrix-vector products.** `GpuSparseContext`
+  currently operates on host vectors and matrices, uploading and downloading
+  on each call. The key missing piece is a `DeviceSparseView` that holds a
+  sparse matrix on device and supports operator syntax (`d_y = d_A * d_x`)
+  with `DeviceMatrix` operands -- keeping the entire SpMV/SpMM pipeline on
+  device. This is essential for iterative solvers and any workflow that chains
+  sparse and dense operations without returning to the host.
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -43,3 +43,10 @@ add_subdirectory(Householder)
 add_subdirectory(Solvers)
 add_subdirectory(Tuning)
 add_subdirectory(BLAS)
+
+# GPU benchmarks have their own CMake project (needs CUDAToolkit).
+# They can also be built standalone: cmake -B build -S benchmarks/GPU
+find_package(CUDAToolkit QUIET)
+if(CUDAToolkit_FOUND)
+  add_subdirectory(GPU)
+endif()
--- a/benchmarks/GPU/CMakeLists.txt
+++ b/benchmarks/GPU/CMakeLists.txt
@@ -0,0 +1,57 @@
+# GPU benchmarks require CUDA runtime + cuSOLVER.
+# Build separately from the main benchmark tree since they need CUDA toolchain.
+#
+# Usage:
+#   cmake -G Ninja -B build-bench-gpu -S benchmarks/GPU \
+#         -DCMAKE_CUDA_ARCHITECTURES=89
+#   cmake --build build-bench-gpu
+#
+# Profiling:
+#   nsys profile --trace=cuda ./build-bench-gpu/bench_gpu_solvers
+#   ncu --set full -o profile ./build-bench-gpu/bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096
+
+cmake_minimum_required(VERSION 3.18)
+project(EigenGpuBenchmarks CXX)
+
+find_package(benchmark REQUIRED)
+find_package(CUDAToolkit REQUIRED)
+
+set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")
+
+function(eigen_add_gpu_benchmark name source)
+  cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN})
+  if(NOT IS_ABSOLUTE "${source}")
+    set(source "${CMAKE_CURRENT_SOURCE_DIR}/${source}")
+  endif()
+  add_executable(${name} ${source})
+  target_include_directories(${name} PRIVATE
+    ${EIGEN_SOURCE_DIR}
+    ${CUDAToolkit_INCLUDE_DIRS})
+  target_link_libraries(${name} PRIVATE
+    benchmark::benchmark benchmark::benchmark_main
+    CUDA::cudart CUDA::cusolver CUDA::cublas)
+  if(BENCH_LIBRARIES)
+    target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES})
+  endif()
+  target_compile_options(${name} PRIVATE -O3 -DNDEBUG)
+  target_compile_definitions(${name} PRIVATE EIGEN_USE_GPU)
+  if(BENCH_DEFINITIONS)
+    target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS})
+  endif()
+endfunction()
+
+# Solver benchmarks: LLT/LU compute + solve, host vs device paths, CPU baselines.
+eigen_add_gpu_benchmark(bench_gpu_solvers bench_gpu_solvers.cpp)
+eigen_add_gpu_benchmark(bench_gpu_solvers_float bench_gpu_solvers.cpp DEFINITIONS SCALAR=float)
+
+# Chaining benchmarks: async pipeline efficiency, host-roundtrip vs device chain.
+eigen_add_gpu_benchmark(bench_gpu_chaining bench_gpu_chaining.cpp)
+eigen_add_gpu_benchmark(bench_gpu_chaining_float bench_gpu_chaining.cpp DEFINITIONS SCALAR=float)
+
+# Batching benchmarks: multi-stream concurrency for many small systems.
+eigen_add_gpu_benchmark(bench_gpu_batching bench_gpu_batching.cpp)
+eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITIONS SCALAR=float)
+
+# FFT benchmarks: 1D/2D C2C, R2C, C2R throughput and plan reuse.
+eigen_add_gpu_benchmark(bench_gpu_fft bench_gpu_fft.cpp LIBRARIES CUDA::cufft)
+eigen_add_gpu_benchmark(bench_gpu_fft_double bench_gpu_fft.cpp LIBRARIES CUDA::cufft DEFINITIONS SCALAR=double)
--- a/benchmarks/GPU/bench_gpu_batching.cpp
+++ b/benchmarks/GPU/bench_gpu_batching.cpp
@@ -0,0 +1,268 @@
+// GPU batching benchmarks: multi-stream concurrency for many small solves.
+//
+// Each GpuLLT/GpuLU owns its own CUDA stream. This benchmark measures how
+// well multiple solver instances overlap on the GPU, which is critical for
+// workloads like robotics (many small systems) and SLAM (batched poses).
+//
+// Compares:
+//   1. Sequential: one solver handles all systems one by one
+//   2. Batched: N solvers on N streams, all launched before any sync
+//   3. CPU baseline: Eigen LLT on host
+//
+// For Nsight Systems: batched mode should show overlapping kernels on
+// different streams in the timeline view.
+//
+//   nsys profile --trace=cuda ./bench_gpu_batching
+
+#include <benchmark/benchmark.h>
+
+#include <Eigen/Cholesky>
+#include <Eigen/GPU>
+
+#include <memory>
+#include <vector>
+
+using namespace Eigen;
+
+#ifndef SCALAR
+#define SCALAR double
+#endif
+
+using Scalar = SCALAR;
+using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+
+static Mat make_spd(Index n) {
+  Mat M = Mat::Random(n, n);
+  return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
+}
+
+static void cuda_warmup() {
+  static bool done = false;
+  if (!done) {
+    void* p;
+    cudaMalloc(&p, 1);
+    cudaFree(p);
+    done = true;
+  }
+}
+
+// --------------------------------------------------------------------------
+// Sequential: one solver, N systems solved one after another
+// --------------------------------------------------------------------------
+
+static void BM_Batch_Sequential(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int batch_size = static_cast<int>(state.range(1));
+
+  // Pre-generate all SPD matrices and RHS vectors.
+  std::vector<Mat> As(batch_size);
+  std::vector<Mat> Bs(batch_size);
+  for (int i = 0; i < batch_size; ++i) {
+    As[i] = make_spd(n);
+    Bs[i] = Mat::Random(n, 1);
+  }
+
+  GpuLLT<Scalar> llt;
+
+  for (auto _ : state) {
+    std::vector<Mat> results(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      llt.compute(As[i]);
+      results[i] = llt.solve(Bs[i]);
+    }
+    benchmark::DoNotOptimize(results.back().data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["batch"] = batch_size;
+  state.counters["total_solves"] = batch_size;
+}
+
+// --------------------------------------------------------------------------
+// Sequential with DeviceMatrix (avoid re-upload of A each iteration)
+// --------------------------------------------------------------------------
+
+static void BM_Batch_Sequential_Device(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int batch_size = static_cast<int>(state.range(1));
+
+  std::vector<Mat> As(batch_size);
+  std::vector<Mat> Bs(batch_size);
+  std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
+  std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
+  for (int i = 0; i < batch_size; ++i) {
+    As[i] = make_spd(n);
+    Bs[i] = Mat::Random(n, 1);
+    d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
+    d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
+  }
+
+  GpuLLT<Scalar> llt;
+
+  for (auto _ : state) {
+    std::vector<Mat> results(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      llt.compute(d_As[i]);
+      DeviceMatrix<Scalar> d_X = llt.solve(d_Bs[i]);
+      results[i] = d_X.toHost();
+    }
+    benchmark::DoNotOptimize(results.back().data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["batch"] = batch_size;
+  state.counters["total_solves"] = batch_size;
+}
+
+// --------------------------------------------------------------------------
+// Batched: N solvers on N streams, overlapping execution
+// --------------------------------------------------------------------------
+
+static void BM_Batch_MultiStream(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int batch_size = static_cast<int>(state.range(1));
+
+  std::vector<Mat> As(batch_size);
+  std::vector<Mat> Bs(batch_size);
+  std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
+  std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
+  for (int i = 0; i < batch_size; ++i) {
+    As[i] = make_spd(n);
+    Bs[i] = Mat::Random(n, 1);
+    d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
+    d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
+  }
+
+  // N solvers = N independent CUDA streams.
+  std::vector<std::unique_ptr<GpuLLT<Scalar>>> solvers(batch_size);
+  for (int i = 0; i < batch_size; ++i) {
+    solvers[i] = std::make_unique<GpuLLT<Scalar>>();
+  }
+
+  for (auto _ : state) {
+    // Phase 1: launch all factorizations (async, different streams).
+    for (int i = 0; i < batch_size; ++i) {
+      solvers[i]->compute(d_As[i]);
+    }
+
+    // Phase 2: launch all solves (async, different streams).
+    std::vector<DeviceMatrix<Scalar>> d_Xs(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      d_Xs[i] = solvers[i]->solve(d_Bs[i]);
+    }
+
+    // Phase 3: download all results.
+    std::vector<Mat> results(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      results[i] = d_Xs[i].toHost();
+    }
+    benchmark::DoNotOptimize(results.back().data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["batch"] = batch_size;
+  state.counters["streams"] = batch_size;
+  state.counters["total_solves"] = batch_size;
+}
+
+// --------------------------------------------------------------------------
+// Batched with async download (overlap D2H with computation)
+// --------------------------------------------------------------------------
+
+static void BM_Batch_MultiStream_AsyncDownload(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int batch_size = static_cast<int>(state.range(1));
+
+  std::vector<Mat> As(batch_size);
+  std::vector<Mat> Bs(batch_size);
+  std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
+  std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
+  for (int i = 0; i < batch_size; ++i) {
+    As[i] = make_spd(n);
+    Bs[i] = Mat::Random(n, 1);
+    d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
+    d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
+  }
+
+  std::vector<std::unique_ptr<GpuLLT<Scalar>>> solvers(batch_size);
+  for (int i = 0; i < batch_size; ++i) {
+    solvers[i] = std::make_unique<GpuLLT<Scalar>>();
+  }
+
+  for (auto _ : state) {
+    // Launch all compute + solve.
+    std::vector<DeviceMatrix<Scalar>> d_Xs(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      solvers[i]->compute(d_As[i]);
+      d_Xs[i] = solvers[i]->solve(d_Bs[i]);
+    }
+
+    // Enqueue all async downloads.
+    std::vector<HostTransfer<Scalar>> transfers;
+    transfers.reserve(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      transfers.push_back(d_Xs[i].toHostAsync());
+    }
+
+    // Collect all results.
+    for (int i = 0; i < batch_size; ++i) {
+      benchmark::DoNotOptimize(transfers[i].get().data());
+    }
+  }
+
+  state.counters["n"] = n;
+  state.counters["batch"] = batch_size;
+  state.counters["streams"] = batch_size;
+  state.counters["total_solves"] = batch_size;
+}
+
+// --------------------------------------------------------------------------
+// CPU baseline: Eigen LLT on host, sequential
+// --------------------------------------------------------------------------
+
+static void BM_Batch_CPU(benchmark::State& state) {
+  const Index n = state.range(0);
+  const int batch_size = static_cast<int>(state.range(1));
+
+  std::vector<Mat> As(batch_size);
+  std::vector<Mat> Bs(batch_size);
+  for (int i = 0; i < batch_size; ++i) {
+    As[i] = make_spd(n);
+    Bs[i] = Mat::Random(n, 1);
+  }
+
+  for (auto _ : state) {
+    std::vector<Mat> results(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      LLT<Mat> llt(As[i]);
+      results[i] = llt.solve(Bs[i]);
+    }
+    benchmark::DoNotOptimize(results.back().data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["batch"] = batch_size;
+  state.counters["total_solves"] = batch_size;
+}
+
+// --------------------------------------------------------------------------
+// Registration
+// --------------------------------------------------------------------------
+
+// clang-format off
+// Args: {matrix_size, batch_size}
+// Small matrices with large batches are the interesting case for multi-stream.
+BENCHMARK(BM_Batch_Sequential)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Batch_Sequential_Device)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Batch_CPU)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
+
+// Also run larger sizes with moderate batching.
+BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond);
+// clang-format on
--- a/benchmarks/GPU/bench_gpu_chaining.cpp
+++ b/benchmarks/GPU/bench_gpu_chaining.cpp
@@ -0,0 +1,216 @@
+// GPU chaining benchmarks: measure async pipeline efficiency.
+//
+// Compares:
+//   1. Host round-trip per solve (baseline)
+//   2. DeviceMatrix chaining (no host round-trip between solves)
+//   3. Varying chain lengths (1, 2, 4, 8 consecutive solves)
+//
+// For Nsight Systems: look for gaps between kernel launches in the timeline.
+// Host round-trip creates visible idle gaps; chaining should show back-to-back kernels.
+//
+//   nsys profile --trace=cuda,nvtx ./bench_gpu_chaining
+
+#include <benchmark/benchmark.h>
+
+#include <Eigen/Cholesky>
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+#ifndef SCALAR
+#define SCALAR double
+#endif
+
+using Scalar = SCALAR;
+using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+
+static Mat make_spd(Index n) {
+  Mat M = Mat::Random(n, n);
+  return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
+}
+
+static void cuda_warmup() {
+  static bool done = false;
+  if (!done) {
+    void* p;
+    cudaMalloc(&p, 1);
+    cudaFree(p);
+    done = true;
+  }
+}
+
+// --------------------------------------------------------------------------
+// Baseline: host round-trip between every solve
+// --------------------------------------------------------------------------
+
+static void BM_Chain_HostRoundtrip(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int chain_len = static_cast<int>(state.range(1));
+
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, 1);
+  GpuLLT<Scalar> llt(A);
+
+  for (auto _ : state) {
+    Mat X = B;
+    for (int i = 0; i < chain_len; ++i) {
+      X = llt.solve(X);  // host → device → host each time
+    }
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["chain"] = chain_len;
+  state.counters["solves/iter"] = chain_len;
+}
+
+// --------------------------------------------------------------------------
+// DeviceMatrix chaining: no host round-trip between solves
+// --------------------------------------------------------------------------
+
+static void BM_Chain_Device(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int chain_len = static_cast<int>(state.range(1));
+
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, 1);
+  GpuLLT<Scalar> llt(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  for (auto _ : state) {
+    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
+    for (int i = 1; i < chain_len; ++i) {
+      d_X = llt.solve(d_X);  // device → device, fully async
+    }
+    Mat X = d_X.toHost();  // single sync at end
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["chain"] = chain_len;
+  state.counters["solves/iter"] = chain_len;
+}
+
+// --------------------------------------------------------------------------
+// DeviceMatrix chaining with async download (overlap D2H with next iteration)
+// --------------------------------------------------------------------------
+
+static void BM_Chain_DeviceAsync(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int chain_len = static_cast<int>(state.range(1));
+
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, 1);
+  GpuLLT<Scalar> llt(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  for (auto _ : state) {
+    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
+    for (int i = 1; i < chain_len; ++i) {
+      d_X = llt.solve(d_X);
+    }
+    auto transfer = d_X.toHostAsync();
+    Mat X = transfer.get();
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["chain"] = chain_len;
+  state.counters["solves/iter"] = chain_len;
+}
+
+// --------------------------------------------------------------------------
+// Pure GPU chain (no download — measures kernel-only throughput)
+// --------------------------------------------------------------------------
+
+static void BM_Chain_DeviceNoDownload(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int chain_len = static_cast<int>(state.range(1));
+
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, 1);
+  GpuLLT<Scalar> llt(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  for (auto _ : state) {
+    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
+    for (int i = 1; i < chain_len; ++i) {
+      d_X = llt.solve(d_X);
+    }
+    cudaStreamSynchronize(llt.stream());
+    benchmark::DoNotOptimize(d_X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["chain"] = chain_len;
+  state.counters["solves/iter"] = chain_len;
+}
+
+// --------------------------------------------------------------------------
+// Compute + solve chain (full pipeline: factorize, then chain solves)
+// --------------------------------------------------------------------------
+
+static void BM_FullPipeline_Host(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int chain_len = static_cast<int>(state.range(1));
+
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, 1);
+
+  for (auto _ : state) {
+    GpuLLT<Scalar> llt(A);
+    Mat X = B;
+    for (int i = 0; i < chain_len; ++i) {
+      X = llt.solve(X);
+    }
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["chain"] = chain_len;
+}
+
+static void BM_FullPipeline_Device(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int chain_len = static_cast<int>(state.range(1));
+
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, 1);
+
+  for (auto _ : state) {
+    auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+    auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+    GpuLLT<Scalar> llt;
+    llt.compute(d_A);
+    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
+    for (int i = 1; i < chain_len; ++i) {
+      d_X = llt.solve(d_X);
+    }
+    Mat X = d_X.toHost();
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["chain"] = chain_len;
+}
+
+// --------------------------------------------------------------------------
+// Registration
+// --------------------------------------------------------------------------
+
+// clang-format off
+// Args: {matrix_size, chain_length}
+BENCHMARK(BM_Chain_HostRoundtrip)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Chain_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Chain_DeviceAsync)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Chain_DeviceNoDownload)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_FullPipeline_Host)->ArgsProduct({{256, 1024, 4096}, {1, 4}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_FullPipeline_Device)->ArgsProduct({{256, 1024, 4096}, {1, 4}})->Unit(benchmark::kMicrosecond);
+// clang-format on
--- a/benchmarks/GPU/bench_gpu_fft.cpp
+++ b/benchmarks/GPU/bench_gpu_fft.cpp
@@ -0,0 +1,185 @@
+// GPU FFT benchmarks: GpuFFT 1D and 2D throughput.
+//
+// Measures forward and inverse FFT performance across a range of sizes,
+// including plan-amortized (reuse) and cold-start (new plan) scenarios.
+//
+// Usage:
+//   cmake --build build-bench-gpu --target bench_gpu_fft
+//   ./build-bench-gpu/bench_gpu_fft
+//
+// Profiling:
+//   nsys profile --trace=cuda ./build-bench-gpu/bench_gpu_fft
+
+#include <benchmark/benchmark.h>
+
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+#ifndef SCALAR
+#define SCALAR float
+#endif
+
+using Scalar = SCALAR;
+using Complex = std::complex<Scalar>;
+using CVec = Matrix<Complex, Dynamic, 1>;
+using RVec = Matrix<Scalar, Dynamic, 1>;
+using CMat = Matrix<Complex, Dynamic, Dynamic>;
+
+// CUDA warm-up: ensure the GPU is initialized before timing.
+static void cuda_warmup() {
+  static bool done = false;
+  if (!done) {
+    void* p;
+    cudaMalloc(&p, 1);
+    cudaFree(p);
+    done = true;
+  }
+}
+
+// --------------------------------------------------------------------------
+// 1D C2C Forward
+// --------------------------------------------------------------------------
+
+static void BM_GpuFFT_1D_C2C_Fwd(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  CVec x = CVec::Random(n);
+  GpuFFT<Scalar> fft;
+
+  // Warm up plan.
+  CVec tmp = fft.fwd(x);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(fft.fwd(x));
+  }
+  state.SetItemsProcessed(state.iterations() * n);
+  state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2);  // read + write
+}
+
+BENCHMARK(BM_GpuFFT_1D_C2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
+
+// --------------------------------------------------------------------------
+// 1D C2C Inverse
+// --------------------------------------------------------------------------
+
+static void BM_GpuFFT_1D_C2C_Inv(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  CVec x = CVec::Random(n);
+  GpuFFT<Scalar> fft;
+  CVec X = fft.fwd(x);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(fft.inv(X));
+  }
+  state.SetItemsProcessed(state.iterations() * n);
+  state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2);
+}
+
+BENCHMARK(BM_GpuFFT_1D_C2C_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
+
+// --------------------------------------------------------------------------
+// 1D R2C Forward
+// --------------------------------------------------------------------------
+
+static void BM_GpuFFT_1D_R2C_Fwd(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  RVec r = RVec::Random(n);
+  GpuFFT<Scalar> fft;
+
+  // Warm up plan.
+  CVec tmp = fft.fwd(r);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(fft.fwd(r));
+  }
+  state.SetItemsProcessed(state.iterations() * n);
+  state.SetBytesProcessed(state.iterations() * (n * sizeof(Scalar) + (n / 2 + 1) * sizeof(Complex)));
+}
+
+BENCHMARK(BM_GpuFFT_1D_R2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
+
+// --------------------------------------------------------------------------
+// 1D C2R Inverse
+// --------------------------------------------------------------------------
+
+static void BM_GpuFFT_1D_C2R_Inv(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  RVec r = RVec::Random(n);
+  GpuFFT<Scalar> fft;
+  CVec R = fft.fwd(r);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(fft.invReal(R, n));
+  }
+  state.SetItemsProcessed(state.iterations() * n);
+  state.SetBytesProcessed(state.iterations() * ((n / 2 + 1) * sizeof(Complex) + n * sizeof(Scalar)));
+}
+
+BENCHMARK(BM_GpuFFT_1D_C2R_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
+
+// --------------------------------------------------------------------------
+// 2D C2C Forward
+// --------------------------------------------------------------------------
+
+static void BM_GpuFFT_2D_C2C_Fwd(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);  // square n x n
+  CMat A = CMat::Random(n, n);
+  GpuFFT<Scalar> fft;
+
+  // Warm up plan.
+  CMat tmp = fft.fwd2d(A);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(fft.fwd2d(A));
+  }
+  state.SetItemsProcessed(state.iterations() * n * n);
+  state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 2);
+}
+
+BENCHMARK(BM_GpuFFT_2D_C2C_Fwd)->RangeMultiplier(2)->Range(64, 4096);
+
+// --------------------------------------------------------------------------
+// 2D C2C Roundtrip (fwd + inv)
+// --------------------------------------------------------------------------
+
+static void BM_GpuFFT_2D_C2C_Roundtrip(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  CMat A = CMat::Random(n, n);
+  GpuFFT<Scalar> fft;
+
+  // Warm up plans.
+  CMat tmp = fft.inv2d(fft.fwd2d(A));
+
+  for (auto _ : state) {
+    CMat B = fft.fwd2d(A);
+    benchmark::DoNotOptimize(fft.inv2d(B));
+  }
+  state.SetItemsProcessed(state.iterations() * n * n * 2);  // fwd + inv
+  state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 4);
+}
+
+BENCHMARK(BM_GpuFFT_2D_C2C_Roundtrip)->RangeMultiplier(2)->Range(64, 4096);
+
+// --------------------------------------------------------------------------
+// 1D Cold start (includes plan creation)
+// --------------------------------------------------------------------------
+
+static void BM_GpuFFT_1D_ColdStart(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  CVec x = CVec::Random(n);
+
+  for (auto _ : state) {
+    GpuFFT<Scalar> fft;  // new object = new plans
+    benchmark::DoNotOptimize(fft.fwd(x));
+  }
+  state.SetItemsProcessed(state.iterations() * n);
+}
+
+BENCHMARK(BM_GpuFFT_1D_ColdStart)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
--- a/benchmarks/GPU/bench_gpu_solvers.cpp
+++ b/benchmarks/GPU/bench_gpu_solvers.cpp
@@ -0,0 +1,296 @@
+// GPU solver benchmarks: GpuLLT and GpuLU compute + solve throughput.
+//
+// Measures factorization and solve performance for the host-matrix and
+// DeviceMatrix code paths across a range of matrix sizes.
+//
+// For Nsight Systems profiling:
+//   nsys profile --trace=cuda,nvtx ./bench_gpu_solvers
+//
+// For Nsight Compute kernel analysis:
+//   ncu --set full -o profile ./bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096
+
+#include <benchmark/benchmark.h>
+
+#include <Eigen/Cholesky>
+#include <Eigen/GPU>
+#include <Eigen/LU>
+
+using namespace Eigen;
+
+#ifndef SCALAR
+#define SCALAR double
+#endif
+
+using Scalar = SCALAR;
+using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+
+// --------------------------------------------------------------------------
+// Helpers
+// --------------------------------------------------------------------------
+
+static Mat make_spd(Index n) {
+  Mat M = Mat::Random(n, n);
+  return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
+}
+
+// CUDA warm-up: ensure the GPU is initialized before timing.
+static void cuda_warmup() {
+  static bool done = false;
+  if (!done) {
+    void* p;
+    cudaMalloc(&p, 1);
+    cudaFree(p);
+    done = true;
+  }
+}
+
+// --------------------------------------------------------------------------
+// GpuLLT benchmarks
+// --------------------------------------------------------------------------
+
+// Factorize from host matrix (includes H2D upload).
+static void BM_GpuLLT_Compute_Host(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  Mat A = make_spd(n);
+  GpuLLT<Scalar> llt;
+
+  for (auto _ : state) {
+    llt.compute(A);
+    if (llt.info() != Success) state.SkipWithError("factorization failed");
+  }
+
+  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["n"] = n;
+}
+
+// Factorize from DeviceMatrix (D2D copy path).
+static void BM_GpuLLT_Compute_Device(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  Mat A = make_spd(n);
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  GpuLLT<Scalar> llt;
+
+  for (auto _ : state) {
+    llt.compute(d_A);
+    if (llt.info() != Success) state.SkipWithError("factorization failed");
+  }
+
+  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["n"] = n;
+}
+
+// Factorize from DeviceMatrix (move path, no copy).
+static void BM_GpuLLT_Compute_DeviceMove(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  Mat A = make_spd(n);
+  GpuLLT<Scalar> llt;
+
+  for (auto _ : state) {
+    auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+    llt.compute(std::move(d_A));
+    if (llt.info() != Success) state.SkipWithError("factorization failed");
+  }
+
+  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["n"] = n;
+}
+
+// Solve from host matrix (H2D + potrs + D2H).
+static void BM_GpuLLT_Solve_Host(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const Index nrhs = state.range(1);
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, nrhs);
+  GpuLLT<Scalar> llt(A);
+
+  for (auto _ : state) {
+    Mat X = llt.solve(B);
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["nrhs"] = nrhs;
+}
+
+// Solve from DeviceMatrix (D2D + potrs, async, toHost at end).
+static void BM_GpuLLT_Solve_Device(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const Index nrhs = state.range(1);
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, nrhs);
+  GpuLLT<Scalar> llt(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  for (auto _ : state) {
+    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
+    Mat X = d_X.toHost();
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["nrhs"] = nrhs;
+}
+
+// Solve staying entirely on device (no toHost — measures pure GPU time).
+static void BM_GpuLLT_Solve_DeviceOnly(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const Index nrhs = state.range(1);
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, nrhs);
+  GpuLLT<Scalar> llt(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  for (auto _ : state) {
+    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
+    // Force completion without D2H transfer.
+    cudaStreamSynchronize(llt.stream());
+    benchmark::DoNotOptimize(d_X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["nrhs"] = nrhs;
+}
+
+// --------------------------------------------------------------------------
+// GpuLU benchmarks
+// --------------------------------------------------------------------------
+
+static void BM_GpuLU_Compute_Host(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  Mat A = Mat::Random(n, n);
+  GpuLU<Scalar> lu;
+
+  for (auto _ : state) {
+    lu.compute(A);
+    if (lu.info() != Success) state.SkipWithError("factorization failed");
+  }
+
+  double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["n"] = n;
+}
+
+static void BM_GpuLU_Compute_Device(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  Mat A = Mat::Random(n, n);
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  GpuLU<Scalar> lu;
+
+  for (auto _ : state) {
+    lu.compute(d_A);
+    if (lu.info() != Success) state.SkipWithError("factorization failed");
+  }
+
+  double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["n"] = n;
+}
+
+static void BM_GpuLU_Solve_Host(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const Index nrhs = state.range(1);
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, nrhs);
+  GpuLU<Scalar> lu(A);
+
+  for (auto _ : state) {
+    Mat X = lu.solve(B);
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["nrhs"] = nrhs;
+}
+
+static void BM_GpuLU_Solve_Device(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const Index nrhs = state.range(1);
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, nrhs);
+  GpuLU<Scalar> lu(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  for (auto _ : state) {
+    DeviceMatrix<Scalar> d_X = lu.solve(d_B);
+    Mat X = d_X.toHost();
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["nrhs"] = nrhs;
+}
+
+// --------------------------------------------------------------------------
+// CPU baselines for comparison
+// --------------------------------------------------------------------------
+
+static void BM_CpuLLT_Compute(benchmark::State& state) {
+  const Index n = state.range(0);
+  Mat A = make_spd(n);
+  LLT<Mat> llt;
+
+  for (auto _ : state) {
+    llt.compute(A);
+    benchmark::DoNotOptimize(llt.matrixLLT().data());
+  }
+
+  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["n"] = n;
+}
+
+static void BM_CpuLU_Compute(benchmark::State& state) {
+  const Index n = state.range(0);
+  Mat A = Mat::Random(n, n);
+  PartialPivLU<Mat> lu;
+
+  for (auto _ : state) {
+    lu.compute(A);
+    benchmark::DoNotOptimize(lu.matrixLU().data());
+  }
+
+  double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["n"] = n;
+}
+
+// --------------------------------------------------------------------------
+// Registration
+// --------------------------------------------------------------------------
+
+// clang-format off
+BENCHMARK(BM_GpuLLT_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLLT_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLLT_Compute_DeviceMove)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLLT_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLLT_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLLT_Solve_DeviceOnly)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_GpuLU_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLU_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLU_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLU_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_CpuLLT_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_CpuLU_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
+// clang-format on
--- a/ci/build.linux.gitlab-ci.yml
+++ b/ci/build.linux.gitlab-ci.yml
@@ -197,7 +197,7 @@ build:linux:x86-64:nvhpc-26.1:default:unsupported:
    # Additional flags passed to the cuda compiler.
    EIGEN_CI_CUDA_CXX_FLAGS: ""
    # Compute architectures present in the GitLab CI runners.
-    EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
+    EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75"
    EIGEN_CI_BUILD_TARGET: buildtests_gpu
    EIGEN_CI_TEST_CUDA_CLANG: "off"
    EIGEN_CI_TEST_CUDA_NVC: "off"
@@ -211,20 +211,20 @@ build:linux:x86-64:nvhpc-26.1:default:unsupported:
    # Build on regular linux to limit GPU cost.
    - saas-linux-2xlarge-amd64

-# GCC-10, CUDA-12.2
-build:linux:cuda-12.2:gcc-10:
+# GCC-11, CUDA-12.2
+build:linux:cuda-12.2:gcc-11:
  extends: .build:linux:cuda
-  image: nvidia/cuda:12.2.0-devel-ubuntu20.04
+  image: nvidia/cuda:12.2.0-devel-ubuntu22.04
  variables:
-    EIGEN_CI_C_COMPILER: gcc-10
-    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_C_COMPILER: gcc-11
+    EIGEN_CI_CXX_COMPILER: g++-11

-# Clang-12, CUDA-12.2
-build:linux:cuda-12.2:clang-12:
-  extends: build:linux:cuda-12.2:gcc-10
+# Clang-14, CUDA-12.2
+build:linux:cuda-12.2:clang-14:
+  extends: build:linux:cuda-12.2:gcc-11
  variables:
-    EIGEN_CI_C_COMPILER: clang-12
-    EIGEN_CI_CXX_COMPILER: clang++-12
+    EIGEN_CI_C_COMPILER: clang-14
+    EIGEN_CI_CXX_COMPILER: clang++-14
    EIGEN_CI_TEST_CUDA_CLANG: "on"


@@ -234,7 +234,7 @@ build:linux:cuda-12.2:clang-12:
 # ROCm HIP
 build:linux:rocm-latest:gcc-10:
  extends: .build:linux:cross
-  image: rocm/dev-ubuntu-24.04:latest
+  image: rocm/dev-ubuntu-24.04:6.3.1
  variables:
    EIGEN_CI_C_COMPILER: gcc-10
    EIGEN_CI_CXX_COMPILER: g++-10
--- a/ci/build.windows.gitlab-ci.yml
+++ b/ci/build.windows.gitlab-ci.yml
@@ -55,7 +55,7 @@ build:windows:x86-64:msvc-14.29:avx512dq:
  extends: .build:windows
  variables:
    # Compute architectures present in the GitLab CI runners.
-    EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
+    EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75"
    EIGEN_CI_BUILD_TARGET: buildtests_gpu
    EIGEN_CI_ADDITIONAL_ARGS:
      -DEIGEN_TEST_CUDA=on
@@ -66,8 +66,8 @@ build:windows:x86-64:msvc-14.29:avx512dq:
    - x86-64
    - cuda

-# MSVC 14.29 + CUDA 11.4
-build:windows:x86-64:cuda-11.4:msvc-14.29:
+# MSVC 14.29 + CUDA 12.2
+build:windows:x86-64:cuda-12.2:msvc-14.29:
  extends: .build:windows:cuda
  variables:
-    EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V11_4
+    EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V12_2
--- a/ci/test.linux.gitlab-ci.yml
+++ b/ci/test.linux.gitlab-ci.yml
@@ -265,23 +265,23 @@ test:linux:x86-64:nvhpc-26.1:default:unsupported:
  tags:
    - saas-linux-medium-amd64-gpu-standard

-# GCC-10, CUDA-12.2
-test:linux:cuda-12.2:gcc-10:
+# GCC-11, CUDA-12.2
+test:linux:cuda-12.2:gcc-11:
  extends: .test:linux:cuda
-  image: nvidia/cuda:12.2.0-devel-ubuntu20.04
-  needs: [ build:linux:cuda-12.2:gcc-10 ]
+  image: nvidia/cuda:12.2.0-devel-ubuntu22.04
+  needs: [ build:linux:cuda-12.2:gcc-11 ]
  variables:
-    EIGEN_CI_CXX_COMPILER: g++-10
-    EIGEN_CI_CC_COMPILER: gcc-10
+    EIGEN_CI_CXX_COMPILER: g++-11
+    EIGEN_CI_CC_COMPILER: gcc-11

-# Clang-12, CUDA-12.2
-test:linux:cuda-12.2:clang-12:
+# Clang-14, CUDA-12.2
+test:linux:cuda-12.2:clang-14:
  extends: .test:linux:cuda
-  image: nvidia/cuda:12.2.0-devel-ubuntu20.04
-  needs: [ build:linux:cuda-12.2:clang-12 ]
+  image: nvidia/cuda:12.2.0-devel-ubuntu22.04
+  needs: [ build:linux:cuda-12.2:clang-14 ]
  variables:
-    EIGEN_CI_CXX_COMPILER: clang++-12
-    EIGEN_CI_CC_COMPILER: clang-12
+    EIGEN_CI_CXX_COMPILER: clang++-14
+    EIGEN_CI_CC_COMPILER: clang-14


 ##### arm ######################################################################
--- a/ci/test.windows.gitlab-ci.yml
+++ b/ci/test.windows.gitlab-ci.yml
@@ -71,7 +71,7 @@ test:windows:x86-64:msvc-14.29:avx512dq:unsupported:
    - x86-64
    - cuda

-# MSVC 14.29 + CUDA 11.4
-test:windows:x86-64:cuda-11.4:msvc-14.29:
+# MSVC 14.29 + CUDA 12.2
+test:windows:x86-64:cuda-12.2:msvc-14.29:
  extends: .test:windows:cuda
-  needs: [ build:windows:x86-64:cuda-11.4:msvc-14.29 ]
+  needs: [ build:windows:x86-64:cuda-12.2:msvc-14.29 ]
--- a/cmake/EigenConfigureTesting.cmake
+++ b/cmake/EigenConfigureTesting.cmake
@@ -20,7 +20,8 @@ add_dependencies(check buildtests)

 # Convenience target for only building GPU tests.
 add_custom_target(buildtests_gpu)
-add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure"
+add_custom_target(check_gpu COMMAND "ctest" ${EIGEN_CTEST_ARGS}
+                                            "--output-on-failure"
                                            "--no-compress-output"
                                            "--build-no-clean"
                                            "-T" "test"
@@ -71,4 +72,3 @@ elseif(MSVC)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS")
 endif()

-
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -8,6 +8,12 @@ macro(ei_add_property prop value)
  endif()
 endmacro()

+if(EIGEN_TEST_HIP AND NOT DEFINED EIGEN_HIP_ARCHITECTURES)
+  set(EIGEN_HIP_ARCHITECTURES
+      gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151
+      CACHE STRING "HIP GPU architectures to build Eigen's HIP tests for.")
+endif()
+
 #internal. See documentation of ei_add_test for details.
 macro(ei_add_test_internal testname testname_with_suffix)
  set(targetname ${testname_with_suffix})
@@ -30,7 +36,7 @@ macro(ei_add_test_internal testname testname_with_suffix)
      hip_reset_flags()
      hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS -std=c++14)
      target_compile_definitions(${targetname} PRIVATE -DEIGEN_USE_HIP)
-      set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+      set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES "${EIGEN_HIP_ARCHITECTURES}")
    elseif(EIGEN_TEST_CUDA_CLANG)
      set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)

@@ -134,6 +140,7 @@ macro(ei_add_test_internal testname testname_with_suffix)
  if (is_gpu_test)
    # Add gpu tag for testing only GPU tests.
    set_property(TEST ${testname_with_suffix} APPEND PROPERTY LABELS "gpu")
+    set_property(TEST ${testname_with_suffix} PROPERTY SKIP_RETURN_CODE 77)
  endif()

  if(EIGEN_SYCL)
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -433,7 +433,7 @@ if(EIGEN_TEST_CUDA_NVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "NVHPC")
  message(WARNING "EIGEN_TEST_CUDA_NVC is set, but CMAKE_CXX_COMPILER does not appear to be nvc++.")
 endif()

-find_package(CUDA 9.0)
+find_package(CUDA 11.4)
 if(CUDA_FOUND AND EIGEN_TEST_CUDA)
  # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
  # and -fno-check-new flags since they trigger thousands of compilation warnings
@@ -479,6 +479,153 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)

  ei_add_test(gpu_example)
  ei_add_test(gpu_basic)
+  ei_add_test(gpu_library_example "" "CUDA::cusolver")
+
+  # DeviceMatrix tests: only CUDA runtime, no NVIDIA libraries.
+  unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+  add_executable(gpu_device_matrix gpu_device_matrix.cpp)
+  target_include_directories(gpu_device_matrix PRIVATE
+    "${CUDA_TOOLKIT_ROOT_DIR}/include"
+    "${CMAKE_CURRENT_BINARY_DIR}")
+  target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart)
+  target_compile_definitions(gpu_device_matrix PRIVATE
+    EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
+    EIGEN_TEST_PART_ALL=1)
+  add_test(NAME gpu_device_matrix COMMAND gpu_device_matrix)
+  add_dependencies(buildtests gpu_device_matrix)
+  add_dependencies(buildtests_gpu gpu_device_matrix)
+  set_property(TEST gpu_device_matrix APPEND PROPERTY LABELS "Official;gpu")
+  set_property(TEST gpu_device_matrix PROPERTY SKIP_RETURN_CODE 77)
+  set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+
+  # Library-specific GPU tests (activated by later phases, OFF by default).
+  # CUDAToolkit imported targets (CUDA::cublas, etc.) are available from
+  # find_package(CUDAToolkit) above.
+  option(EIGEN_TEST_CUBLAS "Test cuBLAS integration" OFF)
+  if(EIGEN_TEST_CUBLAS AND TARGET CUDA::cublas)
+    # cuBLAS tests are plain .cpp files (no device code), like cuSOLVER tests.
+    unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+    add_executable(gpu_cublas gpu_cublas.cpp)
+    target_include_directories(gpu_cublas PRIVATE
+      "${CUDA_TOOLKIT_ROOT_DIR}/include"
+      "${CMAKE_CURRENT_BINARY_DIR}")
+    target_link_libraries(gpu_cublas
+      Eigen3::Eigen CUDA::cudart CUDA::cublas CUDA::cusolver)
+    target_compile_definitions(gpu_cublas PRIVATE
+      EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
+      EIGEN_TEST_PART_ALL=1)
+    add_test(NAME gpu_cublas COMMAND gpu_cublas)
+    add_dependencies(buildtests gpu_cublas)
+    add_dependencies(buildtests_gpu gpu_cublas)
+    set_property(TEST gpu_cublas APPEND PROPERTY LABELS "Official;gpu")
+    set_property(TEST gpu_cublas PROPERTY SKIP_RETURN_CODE 77)
+    set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+  endif()
+
+  option(EIGEN_TEST_CUSOLVER "Test cuSOLVER integration" OFF)
+  if(EIGEN_TEST_CUSOLVER AND TARGET CUDA::cusolver)
+    # cuSOLVER tests are plain .cpp files: no device code, compiled by the host
+    # compiler and linked against CUDA runtime + cuSOLVER. This avoids NVCC
+    # instantiating Eigen's CPU packet operations for CUDA vector types.
+    unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+    foreach(_cusolver_test IN ITEMS gpu_cusolver_llt gpu_cusolver_lu gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen)
+      add_executable(${_cusolver_test} ${_cusolver_test}.cpp)
+      target_include_directories(${_cusolver_test} PRIVATE
+        "${CUDA_TOOLKIT_ROOT_DIR}/include"
+        "${CMAKE_CURRENT_BINARY_DIR}")
+      target_link_libraries(${_cusolver_test}
+        Eigen3::Eigen CUDA::cudart CUDA::cusolver CUDA::cublas)
+      target_compile_definitions(${_cusolver_test} PRIVATE
+        EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
+        EIGEN_TEST_PART_ALL=1)
+      add_test(NAME ${_cusolver_test} COMMAND "${_cusolver_test}")
+      add_dependencies(buildtests ${_cusolver_test})
+      add_dependencies(buildtests_gpu ${_cusolver_test})
+      set_property(TEST ${_cusolver_test} APPEND PROPERTY LABELS "Official;gpu")
+      set_property(TEST ${_cusolver_test} PROPERTY SKIP_RETURN_CODE 77)
+    endforeach()
+    set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+  endif()
+
+  # cuFFT test (cuFFT is part of the CUDA toolkit — no separate option needed).
+  if(TARGET CUDA::cufft)
+    unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+    add_executable(gpu_cufft gpu_cufft.cpp)
+    target_include_directories(gpu_cufft PRIVATE
+      "${CUDA_TOOLKIT_ROOT_DIR}/include"
+      "${CMAKE_CURRENT_BINARY_DIR}")
+    target_link_libraries(gpu_cufft
+      Eigen3::Eigen CUDA::cudart CUDA::cufft CUDA::cublas)
+    target_compile_definitions(gpu_cufft PRIVATE
+      EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
+      EIGEN_TEST_PART_ALL=1)
+    add_test(NAME gpu_cufft COMMAND gpu_cufft)
+    add_dependencies(buildtests gpu_cufft)
+    add_dependencies(buildtests_gpu gpu_cufft)
+    set_property(TEST gpu_cufft APPEND PROPERTY LABELS "Official;gpu")
+    set_property(TEST gpu_cufft PROPERTY SKIP_RETURN_CODE 77)
+    set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+  endif()
+
+  # cuSPARSE SpMV test (cuSPARSE is part of the CUDA toolkit).
+  if(TARGET CUDA::cusparse)
+    unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+    add_executable(gpu_cusparse_spmv gpu_cusparse_spmv.cpp)
+    target_include_directories(gpu_cusparse_spmv PRIVATE
+      "${CUDA_TOOLKIT_ROOT_DIR}/include"
+      "${CMAKE_CURRENT_BINARY_DIR}")
+    target_link_libraries(gpu_cusparse_spmv
+      Eigen3::Eigen CUDA::cudart CUDA::cusparse)
+    target_compile_definitions(gpu_cusparse_spmv PRIVATE
+      EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
+      EIGEN_TEST_PART_ALL=1)
+    add_test(NAME gpu_cusparse_spmv COMMAND gpu_cusparse_spmv)
+    add_dependencies(buildtests gpu_cusparse_spmv)
+    add_dependencies(buildtests_gpu gpu_cusparse_spmv)
+    set_property(TEST gpu_cusparse_spmv APPEND PROPERTY LABELS "Official;gpu")
+    set_property(TEST gpu_cusparse_spmv PROPERTY SKIP_RETURN_CODE 77)
+    set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+  endif()
+
+  option(EIGEN_TEST_CUSPARSE "Test cuSPARSE integration" OFF)
+  if(EIGEN_TEST_CUSPARSE AND TARGET CUDA::cusparse)
+    ei_add_test(gpu_cusparse "" "CUDA::cusparse")
+  endif()
+
+  # cuDSS sparse direct solver tests.
+  # cuDSS is distributed separately from the CUDA Toolkit.
+  option(EIGEN_TEST_CUDSS "Test cuDSS sparse solver integration" OFF)
+  if(EIGEN_TEST_CUDSS)
+    find_path(CUDSS_INCLUDE_DIR cudss.h
+      HINTS ${CUDSS_DIR}/include ${CUDA_TOOLKIT_ROOT_DIR}/include /usr/include)
+    find_library(CUDSS_LIBRARY cudss
+      HINTS ${CUDSS_DIR}/lib ${CUDSS_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib/x86_64-linux-gnu)
+    if(CUDSS_INCLUDE_DIR AND CUDSS_LIBRARY)
+      message(STATUS "cuDSS found: ${CUDSS_LIBRARY}")
+      unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+      foreach(_cudss_test IN ITEMS gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu)
+        add_executable(${_cudss_test} ${_cudss_test}.cpp)
+        target_include_directories(${_cudss_test} PRIVATE
+          "${CUDA_TOOLKIT_ROOT_DIR}/include"
+          "${CUDSS_INCLUDE_DIR}"
+          "${CMAKE_CURRENT_BINARY_DIR}")
+        target_link_libraries(${_cudss_test}
+          Eigen3::Eigen CUDA::cudart CUDA::cusolver CUDA::cublas ${CUDSS_LIBRARY})
+        target_compile_definitions(${_cudss_test} PRIVATE
+          EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
+          EIGEN_TEST_PART_ALL=1
+          EIGEN_CUDSS=1)
+        add_test(NAME ${_cudss_test} COMMAND "${_cudss_test}")
+        add_dependencies(buildtests ${_cudss_test})
+        add_dependencies(buildtests_gpu ${_cudss_test})
+        set_property(TEST ${_cudss_test} APPEND PROPERTY LABELS "Official;gpu")
+        set_property(TEST ${_cudss_test} PROPERTY SKIP_RETURN_CODE 77)
+      endforeach()
+      set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+    else()
+      message(WARNING "EIGEN_TEST_CUDSS=ON but cuDSS not found. Set CUDSS_DIR.")
+    endif()
+  endif()

  unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)

@@ -502,6 +649,9 @@ if (EIGEN_TEST_HIP)
  endif()

  find_package(HIP REQUIRED)
+  if (HIP_FOUND AND HIP_VERSION VERSION_LESS "5.6")
+    message(FATAL_ERROR "Eigen requires ROCm/HIP >= 5.6, found ${HIP_VERSION}")
+  endif()
  if (HIP_FOUND)
    execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)

--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -7,12 +7,6 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

-// workaround issue between gcc >= 4.7 and cuda 5.5
-#if (defined __GNUC__) && (__GNUC__ > 4 || __GNUC_MINOR__ >= 7)
-#undef _GLIBCXX_ATOMIC_BUILTINS
-#undef _GLIBCXX_USE_INT128
-#endif
-
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int

--- a/test/gpu_context.h
+++ b/test/gpu_context.h
@@ -0,0 +1,72 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TEST_GPU_CONTEXT_H
+#define EIGEN_TEST_GPU_CONTEXT_H
+
+// RAII context for GPU tests that use NVIDIA library APIs (cuBLAS, cuSOLVER, etc.).
+// Owns a non-default CUDA stream. Library handles (cuBLAS, cuSOLVER, etc.) are added
+// here by each integration phase as needed; each handle is bound to the owned stream.
+//
+// Usage:
+//   GpuContext ctx;
+//   auto buf = gpu_copy_to_device(ctx.stream, A);
+//   // ... call NVIDIA library APIs using ctx.stream / ctx.cusolver ...
+//   ctx.synchronize();
+
+#include "gpu_test_helper.h"
+
+#ifdef EIGEN_USE_GPU
+#include <cusolverDn.h>
+
+// Checks cuSOLVER return codes, aborts on failure.
+#define CUSOLVER_CHECK(expr)                                                                 \
+  do {                                                                                       \
+    cusolverStatus_t _status = (expr);                                                       \
+    if (_status != CUSOLVER_STATUS_SUCCESS) {                                                \
+      printf("cuSOLVER error %d at %s:%d\n", static_cast<int>(_status), __FILE__, __LINE__); \
+      gpu_assert(false);                                                                     \
+    }                                                                                        \
+  } while (0)
+
+struct GpuContext {
+  cudaStream_t stream = nullptr;
+  cusolverDnHandle_t cusolver = nullptr;
+
+  GpuContext() {
+    GPU_CHECK(gpuGetDevice(&device_));
+    GPU_CHECK(gpuGetDeviceProperties(&device_props_, device_));
+    GPU_CHECK(cudaStreamCreate(&stream));
+    CUSOLVER_CHECK(cusolverDnCreate(&cusolver));
+    CUSOLVER_CHECK(cusolverDnSetStream(cusolver, stream));
+  }
+
+  ~GpuContext() {
+    if (cusolver) CUSOLVER_CHECK(cusolverDnDestroy(cusolver));
+    if (stream) GPU_CHECK(cudaStreamDestroy(stream));
+  }
+
+  int device() const { return device_; }
+  const gpuDeviceProp_t& deviceProperties() const { return device_props_; }
+
+  // Wait for all work submitted on this context's stream to complete.
+  void synchronize() { GPU_CHECK(cudaStreamSynchronize(stream)); }
+
+  // Non-copyable, non-movable.
+  GpuContext(const GpuContext&) = delete;
+  GpuContext& operator=(const GpuContext&) = delete;
+
+ private:
+  int device_ = 0;
+  gpuDeviceProp_t device_props_;
+};
+
+#endif  // EIGEN_USE_GPU
+
+#endif  // EIGEN_TEST_GPU_CONTEXT_H
--- a/test/gpu_cublas.cpp
+++ b/test/gpu_cublas.cpp
@@ -0,0 +1,756 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for cuBLAS GEMM dispatch via DeviceMatrix expression syntax.
+// Covers: d_C = d_A * d_B, adjoint, transpose, scaled, +=, .device(ctx).
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+// Unit roundoff for GPU GEMM compute precision.
+// TF32 (opt-in via EIGEN_CUDA_TF32) has eps ~ 2^{-10}.
+template <typename Scalar>
+typename NumTraits<Scalar>::Real gpu_unit_roundoff() {
+#if defined(EIGEN_CUDA_TF32) && !defined(EIGEN_NO_CUDA_TENSOR_OPS)
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  if (std::is_same<RealScalar, float>::value) return RealScalar(9.8e-4);
+#endif
+  return NumTraits<Scalar>::epsilon();
+}
+
+// Higham-Mary probabilistic error bound for GEMM:
+//   ||C - fl(C)||_F <= lambda * sqrt(k) * u * ||A||_F * ||B||_F
+// where k is the inner dimension, u is the unit roundoff, and
+// lambda = sqrt(2 * ln(2/delta)) with delta = failure probability.
+// lambda = 5 corresponds to delta ~ 10^{-6}.
+// Reference: Higham & Mary, "Probabilistic Error Analysis for Inner Products",
+// SIAM J. Matrix Anal. Appl., 2019.
+template <typename Scalar>
+typename NumTraits<Scalar>::Real gemm_error_bound(Index k, typename NumTraits<Scalar>::Real normA,
+                                                  typename NumTraits<Scalar>::Real normB) {
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  constexpr RealScalar lambda = 5;
+  return lambda * std::sqrt(static_cast<RealScalar>(k)) * gpu_unit_roundoff<Scalar>() * normA * normB;
+}
+
+// ---- Basic GEMM: C = A * B -------------------------------------------------
+
+template <typename Scalar>
+void test_gemm_basic(Index m, Index n, Index k) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  // Expression: d_C = d_A * d_B
+  DeviceMatrix<Scalar> d_C;
+  d_C = d_A * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * B;
+
+  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM with adjoint: C = A^H * B ----------------------------------------
+
+template <typename Scalar>
+void test_gemm_adjoint_lhs(Index m, Index n, Index k) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(k, m);  // A is k×m, A^H is m×k
+  Mat B = Mat::Random(k, n);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  DeviceMatrix<Scalar> d_C;
+  d_C = d_A.adjoint() * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A.adjoint() * B;
+
+  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM with transpose: C = A * B^T --------------------------------------
+
+template <typename Scalar>
+void test_gemm_transpose_rhs(Index m, Index n, Index k) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(n, k);  // B is n×k, B^T is k×n
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  DeviceMatrix<Scalar> d_C;
+  d_C = d_A * d_B.transpose();
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * B.transpose();
+
+  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM with scaled: C = alpha * A * B ------------------------------------
+
+template <typename Scalar>
+void test_gemm_scaled(Index m, Index n, Index k) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+  Scalar alpha = Scalar(2.5);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  DeviceMatrix<Scalar> d_C;
+  d_C = alpha * d_A * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = alpha * A * B;
+
+  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM accumulate: C += A * B (beta=1) -----------------------------------
+
+template <typename Scalar>
+void test_gemm_accumulate(Index m, Index n, Index k) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+  Mat C_init = Mat::Random(m, n);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+  auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);
+
+  d_C += d_A * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = C_init + A * B;
+
+  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM accumulate into empty destination ---------------------------------
+
+template <typename Scalar>
+void test_gemm_accumulate_empty(Index m, Index n, Index k) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+  DeviceMatrix<Scalar> d_C;
+
+  d_C += d_A * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * B;
+
+  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM subtract: C -= A * B (beta=1, alpha=-1) --------------------------
+
+template <typename Scalar>
+void test_gemm_subtract(Index m, Index n, Index k) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+  Mat C_init = Mat::Random(m, n);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+  auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);
+
+  GpuContext ctx;
+  d_C.device(ctx) -= d_A * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = C_init - A * B;
+
+  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM subtract from empty destination -----------------------------------
+
+template <typename Scalar>
+void test_gemm_subtract_empty(Index m, Index n, Index k) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  GpuContext ctx;
+  DeviceMatrix<Scalar> d_C;
+  d_C.device(ctx) -= d_A * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = -(A * B);
+
+  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM with scaled RHS: C = A * (alpha * B) -----------------------------
+
+template <typename Scalar>
+void test_gemm_scaled_rhs(Index m, Index n, Index k) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+  Scalar alpha = Scalar(3.0);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  DeviceMatrix<Scalar> d_C;
+  d_C = d_A * (alpha * d_B);
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * (alpha * B);
+
+  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM dimension mismatch must assert ------------------------------------
+
+template <typename Scalar>
+void test_gemm_dimension_mismatch() {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+
+  Mat A = Mat::Random(8, 5);
+  Mat B = Mat::Random(6, 7);  // inner dimension mismatch
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+  DeviceMatrix<Scalar> d_C;
+
+  VERIFY_RAISES_ASSERT(d_C = d_A * d_B);
+}
+
+// ---- GEMM with explicit GpuContext ------------------------------------------
+
+template <typename Scalar>
+void test_gemm_explicit_context(Index m, Index n, Index k) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  GpuContext ctx;
+  DeviceMatrix<Scalar> d_C;
+  d_C.device(ctx) = d_A * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * B;
+
+  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM cross-context reuse of the same destination -----------------------
+
+template <typename Scalar>
+void test_gemm_cross_context_reuse(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, n);
+  Mat D = Mat::Random(n, n);
+  Mat E = Mat::Random(n, n);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+  auto d_D = DeviceMatrix<Scalar>::fromHost(D);
+  auto d_E = DeviceMatrix<Scalar>::fromHost(E);
+
+  GpuContext ctx1;
+  GpuContext ctx2;
+  DeviceMatrix<Scalar> d_C;
+  d_C.device(ctx1) = d_A * d_B;
+  d_C.device(ctx2) += d_D * d_E;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * B + D * E;
+
+  RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm()) + gemm_error_bound<Scalar>(n, D.norm(), E.norm());
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM cross-context resize of the destination ---------------------------
+
+template <typename Scalar>
+void test_gemm_cross_context_resize() {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(64, 64);
+  Mat B = Mat::Random(64, 64);
+  Mat D = Mat::Random(32, 16);
+  Mat E = Mat::Random(16, 8);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+  auto d_D = DeviceMatrix<Scalar>::fromHost(D);
+  auto d_E = DeviceMatrix<Scalar>::fromHost(E);
+
+  GpuContext ctx1;
+  GpuContext ctx2;
+  DeviceMatrix<Scalar> d_C;
+  d_C.device(ctx1) = d_A * d_B;
+  d_C.device(ctx2) = d_D * d_E;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = D * E;
+
+  RealScalar tol = gemm_error_bound<Scalar>(16, D.norm(), E.norm());
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM chaining: C = (A * B) then D = C * E -----------------------------
+
+template <typename Scalar>
+void test_gemm_chain(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, n);
+  Mat E = Mat::Random(n, n);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+  auto d_E = DeviceMatrix<Scalar>::fromHost(E);
+
+  DeviceMatrix<Scalar> d_C;
+  d_C = d_A * d_B;
+  DeviceMatrix<Scalar> d_D;
+  d_D = d_C * d_E;
+
+  Mat D = d_D.toHost();
+  Mat D_ref = (A * B) * E;
+
+  Mat C_ref = A * B;
+  RealScalar tol =
+      gemm_error_bound<Scalar>(n, A.norm(), B.norm()) * E.norm() + gemm_error_bound<Scalar>(n, C_ref.norm(), E.norm());
+  VERIFY((D - D_ref).norm() < tol);
+}
+
+// ---- Square identity check: A * I = A ---------------------------------------
+
+template <typename Scalar>
+void test_gemm_identity(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+
+  Mat A = Mat::Random(n, n);
+  Mat eye = Mat::Identity(n, n);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_I = DeviceMatrix<Scalar>::fromHost(eye);
+
+  DeviceMatrix<Scalar> d_C;
+  d_C = d_A * d_I;
+
+  Mat C = d_C.toHost();
+  VERIFY_IS_APPROX(C, A);
+}
+
+// ---- LLT solve expression: d_X = d_A.llt().solve(d_B) ----------------------
+
+template <typename MatrixType>
+MatrixType make_spd(Index n) {
+  using Scalar = typename MatrixType::Scalar;
+  MatrixType M = MatrixType::Random(n, n);
+  return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
+}
+
+template <typename Scalar>
+void test_llt_solve_expr(Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = make_spd<Mat>(n);
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  DeviceMatrix<Scalar> d_X;
+  d_X = d_A.llt().solve(d_B);
+
+  Mat X = d_X.toHost();
+  RealScalar residual = (A * X - B).norm() / B.norm();
+  VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
+}
+
+// ---- LLT solve with explicit context ----------------------------------------
+
+template <typename Scalar>
+void test_llt_solve_expr_context(Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = make_spd<Mat>(n);
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  GpuContext ctx;
+  DeviceMatrix<Scalar> d_X;
+  d_X.device(ctx) = d_A.llt().solve(d_B);
+
+  Mat X = d_X.toHost();
+  RealScalar residual = (A * X - B).norm() / B.norm();
+  VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
+}
+
+// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------
+
+template <typename Scalar>
+void test_lu_solve_expr(Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  DeviceMatrix<Scalar> d_X;
+  d_X = d_A.lu().solve(d_B);
+
+  Mat X = d_X.toHost();
+  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
+}
+
+// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------
+
+template <typename Scalar>
+void test_gemm_then_solve(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat D = Mat::Random(n, 1);
+
+  // Make SPD: C = A^H * A + n*I
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  DeviceMatrix<Scalar> d_C;
+  d_C = d_A.adjoint() * d_A;
+
+  // Add n*I on host (no element-wise ops on DeviceMatrix yet).
+  Mat C = d_C.toHost();
+  C += Mat::Identity(n, n) * static_cast<Scalar>(n);
+  d_C = DeviceMatrix<Scalar>::fromHost(C);
+
+  auto d_D = DeviceMatrix<Scalar>::fromHost(D);
+
+  DeviceMatrix<Scalar> d_X;
+  d_X = d_C.llt().solve(d_D);
+
+  Mat X = d_X.toHost();
+  RealScalar residual = (C * X - D).norm() / D.norm();
+  VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
+}
+
+// ---- LLT solve with Upper triangle -----------------------------------------
+
+template <typename Scalar>
+void test_llt_solve_upper(Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = make_spd<Mat>(n);
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  DeviceMatrix<Scalar> d_X;
+  d_X = d_A.template llt<Upper>().solve(d_B);
+
+  Mat X = d_X.toHost();
+  RealScalar residual = (A * X - B).norm() / B.norm();
+  VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
+}
+
+// ---- LU solve with explicit context -----------------------------------------
+
+template <typename Scalar>
+void test_lu_solve_expr_context(Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  GpuContext ctx;
+  DeviceMatrix<Scalar> d_X;
+  d_X.device(ctx) = d_A.lu().solve(d_B);
+
+  Mat X = d_X.toHost();
+  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
+}
+
+// ---- Zero-nrhs solver expressions ------------------------------------------
+
+template <typename Scalar>
+void test_llt_solve_zero_nrhs(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+
+  Mat A = make_spd<Mat>(n);
+  Mat B = Mat::Random(n, 0);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  DeviceMatrix<Scalar> d_X;
+  d_X = d_A.llt().solve(d_B);
+
+  VERIFY_IS_EQUAL(d_X.rows(), n);
+  VERIFY_IS_EQUAL(d_X.cols(), 0);
+}
+
+template <typename Scalar>
+void test_lu_solve_zero_nrhs(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, 0);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  DeviceMatrix<Scalar> d_X;
+  d_X = d_A.lu().solve(d_B);
+
+  VERIFY_IS_EQUAL(d_X.rows(), n);
+  VERIFY_IS_EQUAL(d_X.cols(), 0);
+}
+
+// ---- TRSM: triangularView<UpLo>().solve(B) ----------------------------------
+
+template <typename Scalar, int UpLo>
+void test_trsm(Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  // Build a well-conditioned triangular matrix.
+  Mat A = Mat::Random(n, n);
+  A.diagonal().array() += static_cast<Scalar>(n);  // ensure non-singular
+  if (UpLo == Lower)
+    A = A.template triangularView<Lower>();
+  else
+    A = A.template triangularView<Upper>();
+
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  DeviceMatrix<Scalar> d_X;
+  d_X = d_A.template triangularView<UpLo>().solve(d_B);
+
+  Mat X = d_X.toHost();
+  RealScalar residual = (A * X - B).norm() / B.norm();
+  VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
+}
+
+// ---- SYMM/HEMM: selfadjointView<UpLo>() * B --------------------------------
+
+template <typename Scalar, int UpLo>
+void test_symm(Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = make_spd<Mat>(n);  // SPD is also self-adjoint
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  DeviceMatrix<Scalar> d_C;
+  d_C = d_A.template selfadjointView<UpLo>() * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * B;  // A is symmetric, so full multiply == symm
+
+  RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm());
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- SYRK/HERK: rankUpdate(A) → C = A * A^H --------------------------------
+
+template <typename Scalar>
+void test_syrk(Index n, Index k) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, k);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+
+  DeviceMatrix<Scalar> d_C;
+  d_C.template selfadjointView<Lower>().rankUpdate(d_A);
+
+  Mat C = d_C.toHost();
+  // Only lower triangle is meaningful for SYRK. Compare lower triangle.
+  Mat C_ref = A * A.adjoint();
+
+  // Extract lower triangle for comparison.
+  Mat C_lower = C.template triangularView<Lower>();
+  Mat C_ref_lower = C_ref.template triangularView<Lower>();
+
+  RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), A.norm());
+  VERIFY((C_lower - C_ref_lower).norm() < tol);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_gemm_basic<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_basic<Scalar>(128, 64, 32));
+  CALL_SUBTEST(test_gemm_basic<Scalar>(1, 1, 1));
+  CALL_SUBTEST(test_gemm_basic<Scalar>(256, 256, 256));
+
+  CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(128, 32, 64));
+
+  CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(128, 32, 64));
+
+  CALL_SUBTEST(test_gemm_scaled<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_scaled_rhs<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_accumulate<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_accumulate_empty<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_subtract<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_subtract_empty<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_dimension_mismatch<Scalar>());
+  CALL_SUBTEST(test_gemm_explicit_context<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_cross_context_reuse<Scalar>(64));
+  CALL_SUBTEST(test_gemm_cross_context_resize<Scalar>());
+  CALL_SUBTEST(test_gemm_chain<Scalar>(64));
+  CALL_SUBTEST(test_gemm_identity<Scalar>(64));
+
+  // Solver expressions — zero-size edge cases (use dedicated tests, not residual-based)
+
+  // Solver expressions
+  CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 1));
+  CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 4));
+  CALL_SUBTEST(test_llt_solve_expr<Scalar>(256, 8));
+  CALL_SUBTEST(test_llt_solve_expr_context<Scalar>(64, 4));
+  CALL_SUBTEST(test_llt_solve_upper<Scalar>(64, 4));
+  CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 1));
+  CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 4));
+  CALL_SUBTEST(test_lu_solve_expr<Scalar>(256, 8));
+  CALL_SUBTEST(test_lu_solve_expr_context<Scalar>(64, 4));
+  CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(64));
+  CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(0));
+  CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(64));
+  CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(0));
+  CALL_SUBTEST(test_gemm_then_solve<Scalar>(64));
+
+  // TRSM
+  CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 1)));
+  CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 4)));
+  CALL_SUBTEST((test_trsm<Scalar, Upper>(64, 4)));
+  CALL_SUBTEST((test_trsm<Scalar, Lower>(256, 8)));
+
+  // SYMM/HEMM
+  CALL_SUBTEST((test_symm<Scalar, Lower>(64, 4)));
+  CALL_SUBTEST((test_symm<Scalar, Upper>(64, 4)));
+  CALL_SUBTEST((test_symm<Scalar, Lower>(128, 8)));
+
+  // SYRK/HERK
+  CALL_SUBTEST(test_syrk<Scalar>(64, 64));
+  CALL_SUBTEST(test_syrk<Scalar>(64, 32));
+  CALL_SUBTEST(test_syrk<Scalar>(128, 64));
+}
+
+// ---- Solver failure mode tests (not templated on Scalar) --------------------
+
+void test_llt_not_spd() {
+  // Negative definite matrix — LLT factorization must fail.
+  MatrixXd A = -MatrixXd::Identity(8, 8);
+  MatrixXd B = MatrixXd::Random(8, 1);
+  auto d_A = DeviceMatrix<double>::fromHost(A);
+  auto d_B = DeviceMatrix<double>::fromHost(B);
+  DeviceMatrix<double> d_X;
+  VERIFY_RAISES_ASSERT(d_X = d_A.llt().solve(d_B));
+}
+
+void test_lu_singular() {
+  // Zero matrix — LU factorization must detect singularity.
+  MatrixXd A = MatrixXd::Zero(8, 8);
+  MatrixXd B = MatrixXd::Random(8, 1);
+  auto d_A = DeviceMatrix<double>::fromHost(A);
+  auto d_B = DeviceMatrix<double>::fromHost(B);
+  DeviceMatrix<double> d_X;
+  VERIFY_RAISES_ASSERT(d_X = d_A.lu().solve(d_B));
+}
+
+EIGEN_DECLARE_TEST(gpu_cublas) {
+  CALL_SUBTEST(test_scalar<float>());
+  CALL_SUBTEST(test_scalar<double>());
+  CALL_SUBTEST(test_scalar<std::complex<float>>());
+  CALL_SUBTEST(test_scalar<std::complex<double>>());
+  CALL_SUBTEST(test_llt_not_spd());
+  CALL_SUBTEST(test_lu_singular());
+}
--- a/test/gpu_cudss_ldlt.cpp
+++ b/test/gpu_cudss_ldlt.cpp
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for GpuSparseLDLT: GPU sparse LDL^T via cuDSS.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Sparse>
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Helper: build a random sparse symmetric indefinite matrix ---------------
+
+template <typename Scalar>
+SparseMatrix<Scalar, ColMajor, int> make_symmetric_indefinite(Index n, double density = 0.1) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+
+  // Build a random sparse matrix and symmetrize it.
+  // The diagonal has mixed signs to ensure indefiniteness.
+  SpMat R(n, n);
+  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
+  for (Index j = 0; j < n; ++j) {
+    for (Index i = 0; i < n; ++i) {
+      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
+        R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
+      }
+    }
+  }
+  R.makeCompressed();
+
+  // A = R + R^H (symmetric), then add diagonal with alternating signs for indefiniteness.
+  SpMat A = R + SparseMatrix<Scalar, ColMajor, int>(R.adjoint());
+  for (Index i = 0; i < n; ++i) {
+    Scalar diag_val = Scalar((i % 2 == 0) ? n : -n);
+    A.coeffRef(i, i) += diag_val;
+  }
+  A.makeCompressed();
+  return A;
+}
+
+// ---- Solve and check residual -----------------------------------------------
+
+template <typename Scalar>
+void test_solve(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_symmetric_indefinite<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  GpuSparseLDLT<Scalar> ldlt(A);
+  VERIFY_IS_EQUAL(ldlt.info(), Success);
+
+  Vec x = ldlt.solve(b);
+  VERIFY_IS_EQUAL(x.rows(), n);
+
+  Vec r = A * x - b;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(r.norm() / b.norm() < tol);
+}
+
+// ---- Multiple RHS -----------------------------------------------------------
+
+template <typename Scalar>
+void test_multiple_rhs(Index n, Index nrhs) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_symmetric_indefinite<Scalar>(n);
+  Mat B = Mat::Random(n, nrhs);
+
+  GpuSparseLDLT<Scalar> ldlt(A);
+  VERIFY_IS_EQUAL(ldlt.info(), Success);
+
+  Mat X = ldlt.solve(B);
+  VERIFY_IS_EQUAL(X.rows(), n);
+  VERIFY_IS_EQUAL(X.cols(), nrhs);
+
+  Mat R = A * X - B;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(R.norm() / B.norm() < tol);
+}
+
+// ---- Refactorize ------------------------------------------------------------
+
+template <typename Scalar>
+void test_refactorize(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_symmetric_indefinite<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  GpuSparseLDLT<Scalar> ldlt;
+  ldlt.analyzePattern(A);
+  VERIFY_IS_EQUAL(ldlt.info(), Success);
+
+  ldlt.factorize(A);
+  VERIFY_IS_EQUAL(ldlt.info(), Success);
+  Vec x1 = ldlt.solve(b);
+
+  // Modify values, keep pattern.
+  SpMat A2 = A;
+  for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
+
+  ldlt.factorize(A2);
+  VERIFY_IS_EQUAL(ldlt.info(), Success);
+  Vec x2 = ldlt.solve(b);
+
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((A * x1 - b).norm() / b.norm() < tol);
+  VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
+  VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
+}
+
+// ---- Empty ------------------------------------------------------------------
+
+void test_empty() {
+  using SpMat = SparseMatrix<double, ColMajor, int>;
+  SpMat A(0, 0);
+  A.makeCompressed();
+  GpuSparseLDLT<double> ldlt(A);
+  VERIFY_IS_EQUAL(ldlt.info(), Success);
+  VERIFY_IS_EQUAL(ldlt.rows(), 0);
+  VERIFY_IS_EQUAL(ldlt.cols(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_solve<Scalar>(64));
+  CALL_SUBTEST(test_solve<Scalar>(256));
+  CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
+  CALL_SUBTEST(test_refactorize<Scalar>(64));
+}
+
+EIGEN_DECLARE_TEST(gpu_cudss_ldlt) {
+  CALL_SUBTEST(test_scalar<float>());
+  CALL_SUBTEST(test_scalar<double>());
+  CALL_SUBTEST(test_scalar<std::complex<float>>());
+  CALL_SUBTEST(test_scalar<std::complex<double>>());
+  CALL_SUBTEST(test_empty());
+}
--- a/test/gpu_cudss_llt.cpp
+++ b/test/gpu_cudss_llt.cpp
@@ -0,0 +1,202 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for GpuSparseLLT: GPU sparse Cholesky via cuDSS.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Sparse>
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Helper: build a random sparse SPD matrix -------------------------------
+
+template <typename Scalar>
+SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  // Uses the global std::rand state seeded by the test framework (g_seed).
+  SpMat R(n, n);
+  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
+  for (Index j = 0; j < n; ++j) {
+    for (Index i = 0; i < n; ++i) {
+      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
+        R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
+      }
+    }
+  }
+  R.makeCompressed();
+
+  // A = R^H * R + n * I  (guaranteed SPD).
+  SpMat A = R.adjoint() * R;
+  for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
+  A.makeCompressed();
+  return A;
+}
+
+// ---- Solve and check residual -----------------------------------------------
+
+template <typename Scalar>
+void test_solve(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_spd<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  GpuSparseLLT<Scalar> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  Vec x = llt.solve(b);
+  VERIFY_IS_EQUAL(x.rows(), n);
+
+  // Check residual: ||Ax - b|| / ||b||.
+  Vec r = A * x - b;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(r.norm() / b.norm() < tol);
+}
+
+// ---- Compare with CPU SimplicialLLT -----------------------------------------
+
+template <typename Scalar>
+void test_vs_cpu(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_spd<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  GpuSparseLLT<Scalar> gpu_llt(A);
+  VERIFY_IS_EQUAL(gpu_llt.info(), Success);
+  Vec x_gpu = gpu_llt.solve(b);
+
+  SimplicialLLT<SpMat> cpu_llt(A);
+  VERIFY_IS_EQUAL(cpu_llt.info(), Success);
+  Vec x_cpu = cpu_llt.solve(b);
+
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((x_gpu - x_cpu).norm() / x_cpu.norm() < tol);
+}
+
+// ---- Multiple RHS -----------------------------------------------------------
+
+template <typename Scalar>
+void test_multiple_rhs(Index n, Index nrhs) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_spd<Scalar>(n);
+  Mat B = Mat::Random(n, nrhs);
+
+  GpuSparseLLT<Scalar> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  Mat X = llt.solve(B);
+  VERIFY_IS_EQUAL(X.rows(), n);
+  VERIFY_IS_EQUAL(X.cols(), nrhs);
+
+  Mat R = A * X - B;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(R.norm() / B.norm() < tol);
+}
+
+// ---- Separate analyze + factorize (refactorization) -------------------------
+
+template <typename Scalar>
+void test_refactorize(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_spd<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  GpuSparseLLT<Scalar> llt;
+  llt.analyzePattern(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  // First factorize + solve.
+  llt.factorize(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+  Vec x1 = llt.solve(b);
+
+  // Modify values (keep same pattern): scale diagonal.
+  SpMat A2 = A;
+  for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
+
+  // Refactorize with same pattern.
+  llt.factorize(A2);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+  Vec x2 = llt.solve(b);
+
+  // Both solutions should satisfy their respective systems.
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((A * x1 - b).norm() / b.norm() < tol);
+  VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
+
+  // Solutions should differ (A2 != A).
+  VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
+}
+
+// ---- Empty matrix -----------------------------------------------------------
+
+void test_empty() {
+  using SpMat = SparseMatrix<double, ColMajor, int>;
+  SpMat A(0, 0);
+  A.makeCompressed();
+  GpuSparseLLT<double> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+  VERIFY_IS_EQUAL(llt.rows(), 0);
+  VERIFY_IS_EQUAL(llt.cols(), 0);
+}
+
+// ---- Upper triangle ---------------------------------------------------------
+
+template <typename Scalar>
+void test_upper(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_spd<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  GpuSparseLLT<Scalar, Upper> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  Vec x = llt.solve(b);
+  Vec r = A * x - b;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(r.norm() / b.norm() < tol);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_solve<Scalar>(64));
+  CALL_SUBTEST(test_solve<Scalar>(256));
+  CALL_SUBTEST(test_vs_cpu<Scalar>(64));
+  CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
+  CALL_SUBTEST(test_refactorize<Scalar>(64));
+  CALL_SUBTEST(test_upper<Scalar>(64));
+}
+
+EIGEN_DECLARE_TEST(gpu_cudss_llt) {
+  CALL_SUBTEST(test_scalar<float>());
+  CALL_SUBTEST(test_scalar<double>());
+  CALL_SUBTEST(test_scalar<std::complex<float>>());
+  CALL_SUBTEST(test_scalar<std::complex<double>>());
+  CALL_SUBTEST(test_empty());
+}
--- a/test/gpu_cudss_lu.cpp
+++ b/test/gpu_cudss_lu.cpp
@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for GpuSparseLU: GPU sparse LU via cuDSS.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Sparse>
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Helper: build a random sparse non-singular general matrix ---------------
+
+template <typename Scalar>
+SparseMatrix<Scalar, ColMajor, int> make_general(Index n, double density = 0.1) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat R(n, n);
+  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
+  for (Index j = 0; j < n; ++j) {
+    for (Index i = 0; i < n; ++i) {
+      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
+        R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
+      }
+    }
+  }
+  // Add strong diagonal for non-singularity.
+  for (Index i = 0; i < n; ++i) R.coeffRef(i, i) += Scalar(RealScalar(n));
+  R.makeCompressed();
+  return R;
+}
+
+// ---- Solve and check residual -----------------------------------------------
+
+template <typename Scalar>
+void test_solve(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_general<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  GpuSparseLU<Scalar> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  Vec x = lu.solve(b);
+  VERIFY_IS_EQUAL(x.rows(), n);
+
+  Vec r = A * x - b;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(r.norm() / b.norm() < tol);
+}
+
+// ---- Multiple RHS -----------------------------------------------------------
+
+template <typename Scalar>
+void test_multiple_rhs(Index n, Index nrhs) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_general<Scalar>(n);
+  Mat B = Mat::Random(n, nrhs);
+
+  GpuSparseLU<Scalar> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  Mat X = lu.solve(B);
+  VERIFY_IS_EQUAL(X.rows(), n);
+  VERIFY_IS_EQUAL(X.cols(), nrhs);
+
+  Mat R = A * X - B;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(R.norm() / B.norm() < tol);
+}
+
+// ---- Refactorize ------------------------------------------------------------
+
+template <typename Scalar>
+void test_refactorize(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_general<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  GpuSparseLU<Scalar> lu;
+  lu.analyzePattern(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  lu.factorize(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+  Vec x1 = lu.solve(b);
+
+  // Modify values, keep pattern.
+  SpMat A2 = A;
+  for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
+
+  lu.factorize(A2);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+  Vec x2 = lu.solve(b);
+
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((A * x1 - b).norm() / b.norm() < tol);
+  VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
+  VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
+}
+
+// ---- Empty ------------------------------------------------------------------
+
+void test_empty() {
+  using SpMat = SparseMatrix<double, ColMajor, int>;
+  SpMat A(0, 0);
+  A.makeCompressed();
+  GpuSparseLU<double> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+  VERIFY_IS_EQUAL(lu.rows(), 0);
+  VERIFY_IS_EQUAL(lu.cols(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_solve<Scalar>(64));
+  CALL_SUBTEST(test_solve<Scalar>(256));
+  CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
+  CALL_SUBTEST(test_refactorize<Scalar>(64));
+}
+
+EIGEN_DECLARE_TEST(gpu_cudss_lu) {
+  CALL_SUBTEST(test_scalar<float>());
+  CALL_SUBTEST(test_scalar<double>());
+  CALL_SUBTEST(test_scalar<std::complex<float>>());
+  CALL_SUBTEST(test_scalar<std::complex<double>>());
+  CALL_SUBTEST(test_empty());
+}
--- a/test/gpu_cufft.cpp
+++ b/test/gpu_cufft.cpp
@@ -0,0 +1,186 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for GpuFFT: GPU FFT via cuFFT.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- 1D C2C roundtrip: inv(fwd(x)) ≈ x -------------------------------------
+
+template <typename Scalar>
+void test_c2c_roundtrip(Index n) {
+  using Complex = std::complex<Scalar>;
+  using Vec = Matrix<Complex, Dynamic, 1>;
+  using RealScalar = Scalar;
+
+  Vec x = Vec::Random(n);
+
+  GpuFFT<Scalar> fft;
+  Vec X = fft.fwd(x);
+  VERIFY_IS_EQUAL(X.size(), n);
+
+  Vec y = fft.inv(X);
+  VERIFY_IS_EQUAL(y.size(), n);
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((y - x).norm() / x.norm() < tol);
+}
+
+// ---- 1D C2C known signal: FFT of constant = delta --------------------------
+
+template <typename Scalar>
+void test_c2c_constant() {
+  using Complex = std::complex<Scalar>;
+  using Vec = Matrix<Complex, Dynamic, 1>;
+  using RealScalar = Scalar;
+
+  const int n = 64;
+  Vec x = Vec::Constant(n, Complex(3.0, 0.0));
+
+  GpuFFT<Scalar> fft;
+  Vec X = fft.fwd(x);
+
+  // FFT of constant c: X[0] = c*n, X[k] = 0 for k > 0.
+  RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(n);
+  VERIFY(std::abs(X(0) - Complex(3.0 * n, 0.0)) < tol);
+  for (int k = 1; k < n; ++k) {
+    VERIFY(std::abs(X(k)) < tol);
+  }
+}
+
+// ---- 1D R2C/C2R roundtrip: invReal(fwd(r), n) ≈ r --------------------------
+
+template <typename Scalar>
+void test_r2c_roundtrip(Index n) {
+  using Complex = std::complex<Scalar>;
+  using CVec = Matrix<Complex, Dynamic, 1>;
+  using RVec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = Scalar;
+
+  RVec r = RVec::Random(n);
+
+  GpuFFT<Scalar> fft;
+  CVec R = fft.fwd(r);
+
+  // R2C returns n/2+1 complex values.
+  VERIFY_IS_EQUAL(R.size(), n / 2 + 1);
+
+  RVec s = fft.invReal(R, n);
+  VERIFY_IS_EQUAL(s.size(), n);
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((s - r).norm() / r.norm() < tol);
+}
+
+// ---- 2D C2C roundtrip: inv2d(fwd2d(A)) ≈ A ---------------------------------
+
+template <typename Scalar>
+void test_2d_roundtrip(Index rows, Index cols) {
+  using Complex = std::complex<Scalar>;
+  using Mat = Matrix<Complex, Dynamic, Dynamic>;
+  using RealScalar = Scalar;
+
+  Mat A = Mat::Random(rows, cols);
+
+  GpuFFT<Scalar> fft;
+  Mat B = fft.fwd2d(A);
+  VERIFY_IS_EQUAL(B.rows(), rows);
+  VERIFY_IS_EQUAL(B.cols(), cols);
+
+  Mat C = fft.inv2d(B);
+  VERIFY_IS_EQUAL(C.rows(), rows);
+  VERIFY_IS_EQUAL(C.cols(), cols);
+
+  RealScalar tol = RealScalar(10) * RealScalar(rows * cols) * NumTraits<Scalar>::epsilon();
+  VERIFY((C - A).norm() / A.norm() < tol);
+}
+
+// ---- 2D C2C known signal: constant matrix -----------------------------------
+
+template <typename Scalar>
+void test_2d_constant() {
+  using Complex = std::complex<Scalar>;
+  using Mat = Matrix<Complex, Dynamic, Dynamic>;
+  using RealScalar = Scalar;
+
+  const int rows = 16, cols = 32;
+  Mat A = Mat::Constant(rows, cols, Complex(2.0, 0.0));
+
+  GpuFFT<Scalar> fft;
+  Mat B = fft.fwd2d(A);
+
+  // 2D FFT of constant c: B(0,0) = c*rows*cols, all others = 0.
+  RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(rows * cols);
+  VERIFY(std::abs(B(0, 0) - Complex(2.0 * rows * cols, 0.0)) < tol);
+  for (int j = 0; j < cols; ++j) {
+    for (int i = 0; i < rows; ++i) {
+      if (i == 0 && j == 0) continue;
+      VERIFY(std::abs(B(i, j)) < tol);
+    }
+  }
+}
+
+// ---- Plan reuse: repeated calls should work ---------------------------------
+
+template <typename Scalar>
+void test_plan_reuse() {
+  using Complex = std::complex<Scalar>;
+  using Vec = Matrix<Complex, Dynamic, 1>;
+  using RealScalar = Scalar;
+
+  GpuFFT<Scalar> fft;
+  for (int trial = 0; trial < 5; ++trial) {
+    Vec x = Vec::Random(128);
+    Vec X = fft.fwd(x);
+    Vec y = fft.inv(X);
+    RealScalar tol = RealScalar(10) * RealScalar(128) * NumTraits<Scalar>::epsilon();
+    VERIFY((y - x).norm() / x.norm() < tol);
+  }
+}
+
+// ---- Empty ------------------------------------------------------------------
+
+template <typename Scalar>
+void test_empty() {
+  using Complex = std::complex<Scalar>;
+  using Vec = Matrix<Complex, Dynamic, 1>;
+
+  GpuFFT<Scalar> fft;
+  Vec x(0);
+  Vec X = fft.fwd(x);
+  VERIFY_IS_EQUAL(X.size(), 0);
+  Vec y = fft.inv(X);
+  VERIFY_IS_EQUAL(y.size(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_c2c_roundtrip<Scalar>(64));
+  CALL_SUBTEST(test_c2c_roundtrip<Scalar>(256));
+  CALL_SUBTEST(test_c2c_roundtrip<Scalar>(1000));  // non-power-of-2
+  CALL_SUBTEST(test_c2c_constant<Scalar>());
+  CALL_SUBTEST(test_r2c_roundtrip<Scalar>(64));
+  CALL_SUBTEST(test_r2c_roundtrip<Scalar>(256));
+  CALL_SUBTEST(test_2d_roundtrip<Scalar>(32, 32));
+  CALL_SUBTEST(test_2d_roundtrip<Scalar>(16, 64));  // non-square
+  CALL_SUBTEST(test_2d_constant<Scalar>());
+  CALL_SUBTEST(test_plan_reuse<Scalar>());
+  CALL_SUBTEST(test_empty<Scalar>());
+}
+
+EIGEN_DECLARE_TEST(gpu_cufft) {
+  CALL_SUBTEST(test_scalar<float>());
+  CALL_SUBTEST(test_scalar<double>());
+}
--- a/test/gpu_cusolver_eigen.cpp
+++ b/test/gpu_cusolver_eigen.cpp
@@ -0,0 +1,180 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for GpuSelfAdjointEigenSolver: GPU symmetric/Hermitian eigenvalue
+// decomposition via cuSOLVER.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Eigenvalues>
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Reconstruction: V * diag(W) * V^H ≈ A ---------------------------------
+
+template <typename Scalar>
+void test_eigen_reconstruction(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  // Build a symmetric/Hermitian matrix.
+  Mat R = Mat::Random(n, n);
+  Mat A = R + R.adjoint();
+
+  GpuSelfAdjointEigenSolver<Scalar> es(A);
+  VERIFY_IS_EQUAL(es.info(), Success);
+
+  auto W = es.eigenvalues();
+  Mat V = es.eigenvectors();
+
+  VERIFY_IS_EQUAL(W.size(), n);
+  VERIFY_IS_EQUAL(V.rows(), n);
+  VERIFY_IS_EQUAL(V.cols(), n);
+
+  // Reconstruct: A_hat = V * diag(W) * V^H.
+  Mat A_hat = V * W.asDiagonal() * V.adjoint();
+  RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
+  VERIFY((A_hat - A).norm() < tol);
+
+  // Orthogonality: V^H * V ≈ I.
+  Mat VhV = V.adjoint() * V;
+  Mat eye = Mat::Identity(n, n);
+  VERIFY((VhV - eye).norm() < tol);
+}
+
+// ---- Eigenvalues match CPU SelfAdjointEigenSolver ---------------------------
+
+template <typename Scalar>
+void test_eigen_values(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat R = Mat::Random(n, n);
+  Mat A = R + R.adjoint();
+
+  GpuSelfAdjointEigenSolver<Scalar> gpu_es(A);
+  VERIFY_IS_EQUAL(gpu_es.info(), Success);
+  auto W_gpu = gpu_es.eigenvalues();
+
+  SelfAdjointEigenSolver<Mat> cpu_es(A);
+  auto W_cpu = cpu_es.eigenvalues();
+
+  RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() *
+                   W_cpu.cwiseAbs().maxCoeff();
+  VERIFY((W_gpu - W_cpu).norm() < tol);
+}
+
+// ---- Eigenvalues-only mode --------------------------------------------------
+
+template <typename Scalar>
+void test_eigen_values_only(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat R = Mat::Random(n, n);
+  Mat A = R + R.adjoint();
+
+  GpuSelfAdjointEigenSolver<Scalar> gpu_es(A, GpuSelfAdjointEigenSolver<Scalar>::EigenvaluesOnly);
+  VERIFY_IS_EQUAL(gpu_es.info(), Success);
+  auto W_gpu = gpu_es.eigenvalues();
+
+  SelfAdjointEigenSolver<Mat> cpu_es(A, EigenvaluesOnly);
+  auto W_cpu = cpu_es.eigenvalues();
+
+  RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() *
+                   W_cpu.cwiseAbs().maxCoeff();
+  VERIFY((W_gpu - W_cpu).norm() < tol);
+}
+
+// ---- DeviceMatrix input path ------------------------------------------------
+
+template <typename Scalar>
+void test_eigen_device_matrix(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat R = Mat::Random(n, n);
+  Mat A = R + R.adjoint();
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  GpuSelfAdjointEigenSolver<Scalar> es;
+  es.compute(d_A);
+  VERIFY_IS_EQUAL(es.info(), Success);
+
+  auto W_gpu = es.eigenvalues();
+  Mat V = es.eigenvectors();
+
+  // Verify reconstruction.
+  Mat A_hat = V * W_gpu.asDiagonal() * V.adjoint();
+  RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
+  VERIFY((A_hat - A).norm() < tol);
+}
+
+// ---- Recompute (reuse solver object) ----------------------------------------
+
+template <typename Scalar>
+void test_eigen_recompute(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  GpuSelfAdjointEigenSolver<Scalar> es;
+
+  for (int trial = 0; trial < 3; ++trial) {
+    Mat R = Mat::Random(n, n);
+    Mat A = R + R.adjoint();
+    es.compute(A);
+    VERIFY_IS_EQUAL(es.info(), Success);
+
+    auto W = es.eigenvalues();
+    Mat V = es.eigenvectors();
+    Mat A_hat = V * W.asDiagonal() * V.adjoint();
+    RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
+    VERIFY((A_hat - A).norm() < tol);
+  }
+}
+
+// ---- Empty matrix -----------------------------------------------------------
+
+void test_eigen_empty() {
+  GpuSelfAdjointEigenSolver<double> es(MatrixXd(0, 0));
+  VERIFY_IS_EQUAL(es.info(), Success);
+  VERIFY_IS_EQUAL(es.rows(), 0);
+  VERIFY_IS_EQUAL(es.cols(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  // Reconstruction + orthogonality.
+  CALL_SUBTEST(test_eigen_reconstruction<Scalar>(64));
+  CALL_SUBTEST(test_eigen_reconstruction<Scalar>(128));
+
+  // Eigenvalues match CPU.
+  CALL_SUBTEST(test_eigen_values<Scalar>(64));
+  CALL_SUBTEST(test_eigen_values<Scalar>(128));
+
+  // Values-only mode.
+  CALL_SUBTEST(test_eigen_values_only<Scalar>(64));
+
+  // DeviceMatrix input.
+  CALL_SUBTEST(test_eigen_device_matrix<Scalar>(64));
+
+  // Recompute.
+  CALL_SUBTEST(test_eigen_recompute<Scalar>(32));
+}
+
+EIGEN_DECLARE_TEST(gpu_cusolver_eigen) {
+  CALL_SUBTEST(test_scalar<float>());
+  CALL_SUBTEST(test_scalar<double>());
+  CALL_SUBTEST(test_scalar<std::complex<float>>());
+  CALL_SUBTEST(test_scalar<std::complex<double>>());
+  CALL_SUBTEST(test_eigen_empty());
+}
--- a/test/gpu_cusolver_llt.cpp
+++ b/test/gpu_cusolver_llt.cpp
@@ -0,0 +1,210 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for GpuLLT: GPU Cholesky (LL^T) using cuSOLVER.
+// Covers cusolverDnXpotrf (factorization) and cusolverDnXpotrs (solve)
+// for float, double, complex<float>, complex<double>, Lower and Upper.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Cholesky>
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+// Build a random symmetric positive-definite matrix: A = M^H*M + n*I.
+template <typename MatrixType>
+MatrixType make_spd(Index n) {
+  using Scalar = typename MatrixType::Scalar;
+  MatrixType M = MatrixType::Random(n, n);
+  return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
+}
+
+// Test factorization: L*L^H must reconstruct A to within floating-point tolerance.
+template <typename Scalar, int UpLo>
+void test_potrf(Index n) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = make_spd<MatrixType>(n);
+
+  GpuLLT<Scalar, UpLo> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  // Reconstruct L*L^H and compare to original A.
+  // GpuLLT stores the factor on device; use CPU LLT to get the triangular factor
+  // for reconstruction since GpuLLT does not expose the device-resident factor directly.
+  LLT<MatrixType, UpLo> ref(A);
+  VERIFY_IS_EQUAL(ref.info(), Success);
+  MatrixType A_reconstructed = ref.reconstructedMatrix();
+
+  // Both should equal A to within n*eps*||A||.
+  RealScalar tol = RealScalar(4) * RealScalar(n) * NumTraits<Scalar>::epsilon() * A.norm();
+  VERIFY((A_reconstructed - A).norm() < tol);
+
+  // Smoke-test: llt.solve(b) should return the same result as ref.solve(b).
+  MatrixType b = MatrixType::Random(n, 1);
+  MatrixType x_gpu = llt.solve(b);
+  MatrixType x_cpu = ref.solve(b);
+  VERIFY((x_gpu - x_cpu).norm() < tol);
+}
+
+// Test solve: residual ||A*X - B|| / ||B|| must be small.
+template <typename Scalar, int UpLo>
+void test_potrs(Index n, Index nrhs) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = make_spd<MatrixType>(n);
+  MatrixType B = MatrixType::Random(n, nrhs);
+
+  GpuLLT<Scalar, UpLo> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  MatrixType X = llt.solve(B);
+
+  RealScalar residual = (A * X - B).norm() / B.norm();
+  RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(residual < tol);
+}
+
+// Test that multiple solves against the same factor all produce correct results.
+// This exercises the key design property: L stays on device across calls.
+template <typename Scalar>
+void test_multiple_solves(Index n) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = make_spd<MatrixType>(n);
+  GpuLLT<Scalar, Lower> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon();
+  for (int k = 0; k < 5; ++k) {
+    MatrixType B = MatrixType::Random(n, 3);
+    MatrixType X = llt.solve(B);
+    RealScalar residual = (A * X - B).norm() / B.norm();
+    VERIFY(residual < tol);
+  }
+}
+
+// Test that GpuLLT correctly detects a non-SPD matrix.
+void test_not_spd() {
+  MatrixXd A = -MatrixXd::Identity(8, 8);  // negative definite
+  GpuLLT<double> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), NumericalIssue);
+}
+
+// ---- DeviceMatrix integration tests -----------------------------------------
+
+// compute(DeviceMatrix) + solve(DeviceMatrix) → toHost
+template <typename Scalar, int UpLo>
+void test_device_matrix_solve(Index n, Index nrhs) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = make_spd<MatrixType>(n);
+  MatrixType B = MatrixType::Random(n, nrhs);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  GpuLLT<Scalar, UpLo> llt;
+  llt.compute(d_A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  DeviceMatrix<Scalar> d_X = llt.solve(d_B);
+  MatrixType X = d_X.toHost();
+
+  RealScalar residual = (A * X - B).norm() / B.norm();
+  VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// compute(DeviceMatrix&&) — move path
+template <typename Scalar>
+void test_device_matrix_move_compute(Index n) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = make_spd<MatrixType>(n);
+  MatrixType B = MatrixType::Random(n, 1);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  GpuLLT<Scalar, Lower> llt;
+  llt.compute(std::move(d_A));
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  // d_A should be empty after move.
+  VERIFY(d_A.empty());
+
+  MatrixType X = llt.solve(B);
+  RealScalar residual = (A * X - B).norm() / B.norm();
+  VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// Full async chain: compute → solve → solve again with result as RHS → toHost
+template <typename Scalar>
+void test_chaining(Index n) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = make_spd<MatrixType>(n);
+  MatrixType B = MatrixType::Random(n, 3);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  GpuLLT<Scalar, Lower> llt;
+  llt.compute(d_A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  // Chain: solve → use result as RHS for another solve
+  DeviceMatrix<Scalar> d_X = llt.solve(d_B);
+  DeviceMatrix<Scalar> d_Y = llt.solve(d_X);
+
+  // Only sync at the very end.
+  MatrixType Y = d_Y.toHost();
+
+  // Verify: Y = A^{-2} * B
+  MatrixType X_ref = LLT<MatrixType, Lower>(A).solve(B);
+  MatrixType Y_ref = LLT<MatrixType, Lower>(A).solve(X_ref);
+
+  RealScalar tol = RealScalar(4) * RealScalar(n) * NumTraits<Scalar>::epsilon() * Y_ref.norm();
+  VERIFY((Y - Y_ref).norm() < tol);
+}
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST((test_potrf<Scalar, Lower>(1)));
+  CALL_SUBTEST((test_potrf<Scalar, Lower>(64)));
+  CALL_SUBTEST((test_potrf<Scalar, Lower>(256)));
+  CALL_SUBTEST((test_potrf<Scalar, Upper>(64)));
+  CALL_SUBTEST((test_potrf<Scalar, Upper>(256)));
+
+  CALL_SUBTEST((test_potrs<Scalar, Lower>(64, 1)));
+  CALL_SUBTEST((test_potrs<Scalar, Lower>(64, 4)));
+  CALL_SUBTEST((test_potrs<Scalar, Lower>(256, 8)));
+  CALL_SUBTEST((test_potrs<Scalar, Upper>(64, 1)));
+  CALL_SUBTEST((test_potrs<Scalar, Upper>(256, 4)));
+
+  CALL_SUBTEST(test_multiple_solves<Scalar>(128));
+
+  CALL_SUBTEST((test_device_matrix_solve<Scalar, Lower>(64, 4)));
+  CALL_SUBTEST((test_device_matrix_solve<Scalar, Upper>(128, 1)));
+  CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));
+  CALL_SUBTEST(test_chaining<Scalar>(64));
+}
+
+EIGEN_DECLARE_TEST(gpu_cusolver_llt) {
+  CALL_SUBTEST(test_scalar<float>());
+  CALL_SUBTEST(test_scalar<double>());
+  CALL_SUBTEST(test_scalar<std::complex<float>>());
+  CALL_SUBTEST(test_scalar<std::complex<double>>());
+  CALL_SUBTEST(test_not_spd());
+}
--- a/test/gpu_cusolver_lu.cpp
+++ b/test/gpu_cusolver_lu.cpp
@@ -0,0 +1,206 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for GpuLU: GPU partial-pivoting LU decomposition via cuSOLVER.
+// Covers cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve)
+// for float, double, complex<float>, complex<double>.
+//
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/LU>
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Test factorization + NoTrans solve: residual ||A*X - B|| / ||B|| -------
+
+template <typename Scalar>
+void test_getrf(Index n) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = MatrixType::Random(n, n);
+  MatrixType B = MatrixType::Random(n, 4);
+
+  GpuLU<Scalar> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  MatrixType X = lu.solve(B);
+  // Backward error bound for LU: ||A*X - B|| <= O(n*u) * ||A|| * ||X||.
+  // Normalize by ||A||*||X|| rather than ||B|| to be condition-number agnostic.
+  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// ---- Test solve: A^T*X = B and A^H*X = B ------------------------------------
+
+template <typename Scalar>
+void test_getrs_trans(Index n) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = MatrixType::Random(n, n);
+  MatrixType B = MatrixType::Random(n, 3);
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+
+  GpuLU<Scalar> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  MatrixType Xt = lu.solve(B, GpuLU<Scalar>::Transpose);
+  VERIFY((A.transpose() * Xt - B).norm() / (A.norm() * Xt.norm()) < tol);
+
+  MatrixType Xc = lu.solve(B, GpuLU<Scalar>::ConjugateTranspose);
+  VERIFY((A.adjoint() * Xc - B).norm() / (A.norm() * Xc.norm()) < tol);
+}
+
+// ---- Test multiple solves reuse the device-resident LU ----------------------
+
+template <typename Scalar>
+void test_multiple_solves(Index n) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = MatrixType::Random(n, n);
+  GpuLU<Scalar> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  for (int k = 0; k < 5; ++k) {
+    MatrixType B = MatrixType::Random(n, 3);
+    MatrixType X = lu.solve(B);
+    VERIFY((A * X - B).norm() / (A.norm() * X.norm()) < tol);
+  }
+}
+
+// ---- Agreement with CPU PartialPivLU ----------------------------------------
+
+template <typename Scalar>
+void test_vs_cpu(Index n) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = MatrixType::Random(n, n);
+  MatrixType B = MatrixType::Random(n, 5);
+
+  GpuLU<Scalar> gpu_lu(A);
+  VERIFY_IS_EQUAL(gpu_lu.info(), Success);
+
+  MatrixType X_gpu = gpu_lu.solve(B);
+  MatrixType X_cpu = PartialPivLU<MatrixType>(A).solve(B);
+
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);
+}
+
+// ---- Singular matrix detection ----------------------------------------------
+
+void test_singular() {
+  MatrixXd A = MatrixXd::Zero(8, 8);
+  GpuLU<double> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), NumericalIssue);
+}
+
+// ---- DeviceMatrix integration tests -----------------------------------------
+
+template <typename Scalar>
+void test_device_matrix_solve(Index n) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = MatrixType::Random(n, n);
+  MatrixType B = MatrixType::Random(n, 4);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  GpuLU<Scalar> lu;
+  lu.compute(d_A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  DeviceMatrix<Scalar> d_X = lu.solve(d_B);
+  MatrixType X = d_X.toHost();
+
+  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+template <typename Scalar>
+void test_device_matrix_move_compute(Index n) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = MatrixType::Random(n, n);
+  MatrixType B = MatrixType::Random(n, 1);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  GpuLU<Scalar> lu;
+  lu.compute(std::move(d_A));
+  VERIFY_IS_EQUAL(lu.info(), Success);
+  VERIFY(d_A.empty());
+
+  MatrixType X = lu.solve(B);
+  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+template <typename Scalar>
+void test_chaining(Index n) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = MatrixType::Random(n, n);
+  MatrixType B = MatrixType::Random(n, 3);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  GpuLU<Scalar> lu;
+  lu.compute(d_A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  // Chain: solve → use result as RHS
+  DeviceMatrix<Scalar> d_X = lu.solve(d_B);
+  DeviceMatrix<Scalar> d_Y = lu.solve(d_X);
+  MatrixType Y = d_Y.toHost();
+
+  MatrixType X_ref = PartialPivLU<MatrixType>(A).solve(B);
+  MatrixType Y_ref = PartialPivLU<MatrixType>(A).solve(X_ref);
+
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon() * Y_ref.norm();
+  VERIFY((Y - Y_ref).norm() < tol);
+}
+
+// ---- Per-scalar driver -------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_getrf<Scalar>(1));
+  CALL_SUBTEST(test_getrf<Scalar>(64));
+  CALL_SUBTEST(test_getrf<Scalar>(256));
+
+  CALL_SUBTEST(test_getrs_trans<Scalar>(64));
+  CALL_SUBTEST(test_getrs_trans<Scalar>(128));
+
+  CALL_SUBTEST(test_multiple_solves<Scalar>(128));
+
+  CALL_SUBTEST(test_vs_cpu<Scalar>(64));
+  CALL_SUBTEST(test_vs_cpu<Scalar>(256));
+
+  CALL_SUBTEST(test_device_matrix_solve<Scalar>(64));
+  CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));
+  CALL_SUBTEST(test_chaining<Scalar>(64));
+}
+
+EIGEN_DECLARE_TEST(gpu_cusolver_lu) {
+  CALL_SUBTEST(test_scalar<float>());
+  CALL_SUBTEST(test_scalar<double>());
+  CALL_SUBTEST(test_scalar<std::complex<float>>());
+  CALL_SUBTEST(test_scalar<std::complex<double>>());
+  CALL_SUBTEST(test_singular());
+}
--- a/test/gpu_cusolver_qr.cpp
+++ b/test/gpu_cusolver_qr.cpp
@@ -0,0 +1,185 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for GpuQR: GPU QR decomposition via cuSOLVER.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/QR>
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Solve square system: A * X = B -----------------------------------------
+
+template <typename Scalar>
+void test_qr_solve_square(Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, nrhs);
+
+  GpuQR<Scalar> qr(A);
+  VERIFY_IS_EQUAL(qr.info(), Success);
+
+  Mat X = qr.solve(B);
+  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// ---- Solve overdetermined system: m > n (least-squares) ---------------------
+
+template <typename Scalar>
+void test_qr_solve_overdetermined(Index m, Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  eigen_assert(m >= n);
+  Mat A = Mat::Random(m, n);
+  Mat B = Mat::Random(m, nrhs);
+
+  GpuQR<Scalar> qr(A);
+  VERIFY_IS_EQUAL(qr.info(), Success);
+
+  Mat X = qr.solve(B);
+  VERIFY_IS_EQUAL(X.rows(), n);
+  VERIFY_IS_EQUAL(X.cols(), nrhs);
+
+  // Compare with CPU QR.
+  Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
+  RealScalar tol = RealScalar(100) * RealScalar(m) * NumTraits<Scalar>::epsilon();
+  VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
+}
+
+// ---- Solve with DeviceMatrix input ------------------------------------------
+
+template <typename Scalar>
+void test_qr_solve_device(Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  GpuQR<Scalar> qr;
+  qr.compute(d_A);
+  VERIFY_IS_EQUAL(qr.info(), Success);
+
+  DeviceMatrix<Scalar> d_X = qr.solve(d_B);
+  Mat X = d_X.toHost();
+
+  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// ---- Solve overdetermined via device path -----------------------------------
+
+template <typename Scalar>
+void test_qr_solve_overdetermined_device(Index m, Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  eigen_assert(m >= n);
+  Mat A = Mat::Random(m, n);
+  Mat B = Mat::Random(m, nrhs);
+
+  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
+
+  GpuQR<Scalar> qr;
+  qr.compute(d_A);
+  VERIFY_IS_EQUAL(qr.info(), Success);
+
+  DeviceMatrix<Scalar> d_X = qr.solve(d_B);
+  VERIFY_IS_EQUAL(d_X.rows(), n);
+  VERIFY_IS_EQUAL(d_X.cols(), nrhs);
+
+  Mat X = d_X.toHost();
+  Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
+  RealScalar tol = RealScalar(100) * RealScalar(m) * NumTraits<Scalar>::epsilon();
+  VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
+}
+
+// ---- Multiple solves reuse the factorization --------------------------------
+
+template <typename Scalar>
+void test_qr_multiple_solves(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  GpuQR<Scalar> qr(A);
+  VERIFY_IS_EQUAL(qr.info(), Success);
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  for (int k = 0; k < 5; ++k) {
+    Mat B = Mat::Random(n, 3);
+    Mat X = qr.solve(B);
+    RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+    VERIFY(residual < tol);
+  }
+}
+
+// ---- Agreement with CPU HouseholderQR ---------------------------------------
+
+template <typename Scalar>
+void test_qr_vs_cpu(Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, nrhs);
+
+  GpuQR<Scalar> gpu_qr(A);
+  VERIFY_IS_EQUAL(gpu_qr.info(), Success);
+
+  Mat X_gpu = gpu_qr.solve(B);
+  Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
+
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_qr_solve_square<Scalar>(1, 1));
+  CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 1));
+  CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 4));
+  CALL_SUBTEST(test_qr_solve_square<Scalar>(256, 8));
+
+  CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(128, 64, 4));
+  CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(256, 128, 1));
+
+  CALL_SUBTEST(test_qr_solve_device<Scalar>(64, 4));
+  CALL_SUBTEST(test_qr_solve_overdetermined_device<Scalar>(128, 64, 4));
+  CALL_SUBTEST(test_qr_multiple_solves<Scalar>(64));
+  CALL_SUBTEST(test_qr_vs_cpu<Scalar>(64, 4));
+  CALL_SUBTEST(test_qr_vs_cpu<Scalar>(256, 8));
+}
+
+void test_qr_empty() {
+  GpuQR<double> qr(MatrixXd(0, 0));
+  VERIFY_IS_EQUAL(qr.info(), Success);
+  VERIFY_IS_EQUAL(qr.rows(), 0);
+  VERIFY_IS_EQUAL(qr.cols(), 0);
+}
+
+EIGEN_DECLARE_TEST(gpu_cusolver_qr) {
+  CALL_SUBTEST(test_scalar<float>());
+  CALL_SUBTEST(test_scalar<double>());
+  CALL_SUBTEST(test_scalar<std::complex<float>>());
+  CALL_SUBTEST(test_scalar<std::complex<double>>());
+  CALL_SUBTEST(test_qr_empty());
+}
--- a/test/gpu_cusolver_svd.cpp
+++ b/test/gpu_cusolver_svd.cpp
@@ -0,0 +1,194 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for GpuSVD: GPU SVD via cuSOLVER.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/SVD>
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- SVD reconstruction: U * diag(S) * VT ≈ A ------------------------------
+
+template <typename Scalar, unsigned int Options>
+void test_svd_reconstruction(Index m, Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, n);
+  GpuSVD<Scalar> svd(A, Options);
+  VERIFY_IS_EQUAL(svd.info(), Success);
+
+  auto S = svd.singularValues();
+  Mat U = svd.matrixU();
+  Mat VT = svd.matrixVT();
+
+  const Index k = (std::min)(m, n);
+
+  // Reconstruct: A_hat = U[:,:k] * diag(S) * VT[:k,:].
+  Mat A_hat = U.leftCols(k) * S.asDiagonal() * VT.topRows(k);
+  RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(k)) * NumTraits<Scalar>::epsilon() * A.norm();
+  VERIFY((A_hat - A).norm() < tol);
+
+  // Orthogonality: U^H * U ≈ I.
+  Mat UtU = U.adjoint() * U;
+  Mat I_u = Mat::Identity(U.cols(), U.cols());
+  VERIFY((UtU - I_u).norm() < tol);
+
+  // Orthogonality: VT * VT^H ≈ I.
+  Mat VtVh = VT * VT.adjoint();
+  Mat I_v = Mat::Identity(VT.rows(), VT.rows());
+  VERIFY((VtVh - I_v).norm() < tol);
+}
+
+// ---- Singular values match CPU BDCSVD ---------------------------------------
+
+template <typename Scalar>
+void test_svd_singular_values(Index m, Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, n);
+  GpuSVD<Scalar> svd(A, 0);  // values only
+  VERIFY_IS_EQUAL(svd.info(), Success);
+
+  auto S_gpu = svd.singularValues();
+  auto S_cpu = BDCSVD<Mat>(A, 0).singularValues();
+
+  RealScalar tol =
+      RealScalar(5) * std::sqrt(static_cast<RealScalar>((std::min)(m, n))) * NumTraits<Scalar>::epsilon() * S_cpu(0);
+  VERIFY((S_gpu - S_cpu).norm() < tol);
+}
+
+// ---- Solve: pseudoinverse ---------------------------------------------------
+
+template <typename Scalar>
+void test_svd_solve(Index m, Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, n);
+  Mat B = Mat::Random(m, nrhs);
+
+  GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
+  VERIFY_IS_EQUAL(svd.info(), Success);
+
+  Mat X = svd.solve(B);
+  VERIFY_IS_EQUAL(X.rows(), n);
+  VERIFY_IS_EQUAL(X.cols(), nrhs);
+
+  // Compare with CPU BDCSVD solve.
+  Mat X_cpu = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV).solve(B);
+  RealScalar tol = RealScalar(100) * RealScalar((std::max)(m, n)) * NumTraits<Scalar>::epsilon();
+  VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
+}
+
+// ---- Solve: truncated -------------------------------------------------------
+
+template <typename Scalar>
+void test_svd_solve_truncated(Index m, Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, n);
+  Mat B = Mat::Random(m, 1);
+  const Index k = (std::min)(m, n);
+  const Index trunc = k / 2;
+  eigen_assert(trunc > 0);
+
+  GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
+  Mat X_trunc = svd.solve(B, trunc);
+
+  // Build CPU reference: truncated pseudoinverse.
+  auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
+  auto S = cpu_svd.singularValues();
+  Mat U = cpu_svd.matrixU();
+  Mat V = cpu_svd.matrixV();
+
+  // D_ii = 1/S_i for i < trunc, 0 otherwise.
+  Matrix<RealScalar, Dynamic, 1> D = Matrix<RealScalar, Dynamic, 1>::Zero(k);
+  for (Index i = 0; i < trunc; ++i) D(i) = RealScalar(1) / S(i);
+  Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;
+
+  RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
+  VERIFY((X_trunc - X_ref).norm() / X_ref.norm() < tol);
+}
+
+// ---- Solve: Tikhonov regularized --------------------------------------------
+
+template <typename Scalar>
+void test_svd_solve_regularized(Index m, Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, n);
+  Mat B = Mat::Random(m, 1);
+  RealScalar lambda = RealScalar(0.1);
+  const Index k = (std::min)(m, n);
+
+  GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
+  Mat X_reg = svd.solve(B, lambda);
+
+  // CPU reference: D_ii = S_i / (S_i^2 + lambda^2).
+  auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
+  auto S = cpu_svd.singularValues();
+  Mat U = cpu_svd.matrixU();
+  Mat V = cpu_svd.matrixV();
+
+  Matrix<RealScalar, Dynamic, 1> D(k);
+  for (Index i = 0; i < k; ++i) D(i) = S(i) / (S(i) * S(i) + lambda * lambda);
+  Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;
+
+  RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
+  VERIFY((X_reg - X_ref).norm() / X_ref.norm() < tol);
+}
+
+// ---- Empty matrix -----------------------------------------------------------
+
+void test_svd_empty() {
+  GpuSVD<double> svd(MatrixXd(0, 0), 0);
+  VERIFY_IS_EQUAL(svd.info(), Success);
+  VERIFY_IS_EQUAL(svd.rows(), 0);
+  VERIFY_IS_EQUAL(svd.cols(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  // Reconstruction + orthogonality (thin and full, identical test logic).
+  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 64)));
+  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(128, 64)));
+  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 128)));  // wide (m < n)
+  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(64, 64)));
+  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(128, 64)));
+
+  // Singular values.
+  CALL_SUBTEST(test_svd_singular_values<Scalar>(64, 64));
+  CALL_SUBTEST(test_svd_singular_values<Scalar>(128, 64));
+
+  // Solve.
+  CALL_SUBTEST(test_svd_solve<Scalar>(64, 64, 4));
+  CALL_SUBTEST(test_svd_solve<Scalar>(128, 64, 4));
+  CALL_SUBTEST(test_svd_solve<Scalar>(64, 128, 4));  // wide (m < n)
+
+  // Truncated and regularized solve.
+  CALL_SUBTEST(test_svd_solve_truncated<Scalar>(64, 64));
+  CALL_SUBTEST(test_svd_solve_regularized<Scalar>(64, 64));
+}
+
+EIGEN_DECLARE_TEST(gpu_cusolver_svd) {
+  CALL_SUBTEST(test_scalar<float>());
+  CALL_SUBTEST(test_scalar<double>());
+  CALL_SUBTEST(test_scalar<std::complex<float>>());
+  CALL_SUBTEST(test_scalar<std::complex<double>>());
+  CALL_SUBTEST(test_svd_empty());
+}
--- a/test/gpu_cusparse_spmv.cpp
+++ b/test/gpu_cusparse_spmv.cpp
@@ -0,0 +1,203 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for GpuSparseContext: GPU SpMV/SpMM via cuSPARSE.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Sparse>
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Helper: build a random sparse matrix -----------------------------------
+
+template <typename Scalar>
+SparseMatrix<Scalar, ColMajor, int> make_sparse(Index rows, Index cols, double density = 0.1) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat R(rows, cols);
+  R.reserve(VectorXi::Constant(cols, static_cast<int>(rows * density) + 1));
+  for (Index j = 0; j < cols; ++j) {
+    for (Index i = 0; i < rows; ++i) {
+      if ((std::rand() / double(RAND_MAX)) < density) {
+        R.insert(i, j) = Scalar(RealScalar(std::rand() / double(RAND_MAX) - 0.5));
+      }
+    }
+  }
+  R.makeCompressed();
+  return R;
+}
+
+// ---- SpMV: y = A * x -------------------------------------------------------
+
+template <typename Scalar>
+void test_spmv(Index rows, Index cols) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(rows, cols);
+  Vec x = Vec::Random(cols);
+
+  GpuSparseContext<Scalar> ctx;
+  Vec y_gpu = ctx.multiply(A, x);
+  Vec y_cpu = A * x;
+
+  RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
+  VERIFY_IS_EQUAL(y_gpu.size(), rows);
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- SpMV with alpha/beta: y = alpha*A*x + beta*y ---------------------------
+
+template <typename Scalar>
+void test_spmv_alpha_beta(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(n, n);
+  Vec x = Vec::Random(n);
+  Vec y_init = Vec::Random(n);
+
+  Scalar alpha(2);
+  Scalar beta(3);
+
+  Vec y_cpu = alpha * (A * x) + beta * y_init;
+
+  GpuSparseContext<Scalar> ctx;
+  Vec y_gpu = y_init;
+  ctx.multiply(A, x, y_gpu, alpha, beta);
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- Transpose: y = A^T * x ------------------------------------------------
+
+template <typename Scalar>
+void test_spmv_transpose(Index rows, Index cols) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(rows, cols);
+  Vec x = Vec::Random(rows);
+
+  GpuSparseContext<Scalar> ctx;
+  Vec y_gpu = ctx.multiplyT(A, x);
+  Vec y_cpu = A.transpose() * x;
+
+  RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
+  VERIFY_IS_EQUAL(y_gpu.size(), cols);
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- SpMM: Y = A * X (multiple RHS) ----------------------------------------
+
+template <typename Scalar>
+void test_spmm(Index rows, Index cols, Index nrhs) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(rows, cols);
+  Mat X = Mat::Random(cols, nrhs);
+
+  GpuSparseContext<Scalar> ctx;
+  Mat Y_gpu = ctx.multiplyMat(A, X);
+  Mat Y_cpu = A * X;
+
+  RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
+  VERIFY_IS_EQUAL(Y_gpu.rows(), rows);
+  VERIFY_IS_EQUAL(Y_gpu.cols(), nrhs);
+  VERIFY((Y_gpu - Y_cpu).norm() / (Y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- Identity matrix: I * x = x --------------------------------------------
+
+template <typename Scalar>
+void test_identity(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  // Build sparse identity.
+  SpMat eye(n, n);
+  eye.setIdentity();
+  eye.makeCompressed();
+
+  Vec x = Vec::Random(n);
+
+  GpuSparseContext<Scalar> ctx;
+  Vec y = ctx.multiply(eye, x);
+
+  RealScalar tol = NumTraits<Scalar>::epsilon();
+  VERIFY((y - x).norm() < tol);
+}
+
+// ---- Context reuse ----------------------------------------------------------
+
+template <typename Scalar>
+void test_reuse(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  GpuSparseContext<Scalar> ctx;
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+
+  for (int trial = 0; trial < 3; ++trial) {
+    SpMat A = make_sparse<Scalar>(n, n);
+    Vec x = Vec::Random(n);
+    Vec y_gpu = ctx.multiply(A, x);
+    Vec y_cpu = A * x;
+    VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+  }
+}
+
+// ---- Empty ------------------------------------------------------------------
+
+template <typename Scalar>
+void test_empty() {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+
+  SpMat A(0, 0);
+  A.makeCompressed();
+  Vec x(0);
+
+  GpuSparseContext<Scalar> ctx;
+  Vec y = ctx.multiply(A, x);
+  VERIFY_IS_EQUAL(y.size(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_spmv<Scalar>(64, 64));
+  CALL_SUBTEST(test_spmv<Scalar>(128, 64));  // non-square
+  CALL_SUBTEST(test_spmv<Scalar>(64, 128));  // wide
+  CALL_SUBTEST(test_spmv_alpha_beta<Scalar>(64));
+  CALL_SUBTEST(test_spmv_transpose<Scalar>(128, 64));
+  CALL_SUBTEST(test_spmm<Scalar>(64, 64, 4));
+  CALL_SUBTEST(test_identity<Scalar>(64));
+  CALL_SUBTEST(test_reuse<Scalar>(64));
+  CALL_SUBTEST(test_empty<Scalar>());
+}
+
+EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {
+  CALL_SUBTEST(test_scalar<float>());
+  CALL_SUBTEST(test_scalar<double>());
+  CALL_SUBTEST(test_scalar<std::complex<float>>());
+  CALL_SUBTEST(test_scalar<std::complex<double>>());
+}
--- a/test/gpu_device_matrix.cpp
+++ b/test/gpu_device_matrix.cpp
@@ -0,0 +1,245 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for DeviceMatrix and HostTransfer: typed RAII GPU memory wrapper.
+// No cuSOLVER dependency — only CUDA runtime.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Default construction ---------------------------------------------------
+
+void test_default_construct() {
+  DeviceMatrix<double> dm;
+  VERIFY(dm.empty());
+  VERIFY_IS_EQUAL(dm.rows(), 0);
+  VERIFY_IS_EQUAL(dm.cols(), 0);
+  VERIFY(dm.data() == nullptr);
+  VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(0));
+}
+
+// ---- Allocate uninitialized -------------------------------------------------
+
+template <typename Scalar>
+void test_allocate(Index rows, Index cols) {
+  DeviceMatrix<Scalar> dm(rows, cols);
+  VERIFY(!dm.empty());
+  VERIFY_IS_EQUAL(dm.rows(), rows);
+  VERIFY_IS_EQUAL(dm.cols(), cols);
+  VERIFY(dm.data() != nullptr);
+  VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar));
+}
+
+// ---- fromHost / toHost roundtrip (synchronous) ------------------------------
+
+template <typename Scalar>
+void test_roundtrip(Index rows, Index cols) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  MatrixType host = MatrixType::Random(rows, cols);
+
+  auto dm = DeviceMatrix<Scalar>::fromHost(host);
+  VERIFY_IS_EQUAL(dm.rows(), rows);
+  VERIFY_IS_EQUAL(dm.cols(), cols);
+  VERIFY(!dm.empty());
+
+  MatrixType result = dm.toHost();
+  VERIFY_IS_EQUAL(result.rows(), rows);
+  VERIFY_IS_EQUAL(result.cols(), cols);
+  VERIFY_IS_APPROX(result, host);
+}
+
+// ---- fromHostAsync / toHostAsync roundtrip -----------------------------------
+
+template <typename Scalar>
+void test_roundtrip_async(Index rows, Index cols) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  MatrixType host = MatrixType::Random(rows, cols);
+
+  cudaStream_t stream;
+  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));
+
+  // Async upload from raw pointer.
+  auto dm = DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, stream);
+  VERIFY_IS_EQUAL(dm.rows(), rows);
+  VERIFY_IS_EQUAL(dm.cols(), cols);
+
+  // Async download via HostTransfer future.
+  auto transfer = dm.toHostAsync(stream);
+
+  // get() blocks and returns the matrix.
+  MatrixType result = transfer.get();
+  VERIFY_IS_APPROX(result, host);
+
+  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(stream));
+}
+
+// ---- HostTransfer::ready() and idempotent get() -----------------------------
+
+void test_host_transfer_ready() {
+  using MatrixType = Matrix<double, Dynamic, Dynamic>;
+  MatrixType host = MatrixType::Random(100, 100);
+
+  auto dm = DeviceMatrix<double>::fromHost(host);
+  auto transfer = dm.toHostAsync();
+
+  // After get(), ready() must return true.
+  MatrixType result = transfer.get();
+  VERIFY(transfer.ready());
+  VERIFY_IS_APPROX(result, host);
+
+  // get() is idempotent.
+  MatrixType& result2 = transfer.get();
+  VERIFY_IS_APPROX(result2, host);
+}
+
+// ---- HostTransfer move ------------------------------------------------------
+
+void test_host_transfer_move() {
+  using MatrixType = Matrix<double, Dynamic, Dynamic>;
+  MatrixType host = MatrixType::Random(50, 50);
+
+  auto dm = DeviceMatrix<double>::fromHost(host);
+  auto transfer = dm.toHostAsync();
+
+  HostTransfer<double> moved(std::move(transfer));
+  MatrixType result = moved.get();
+  VERIFY_IS_APPROX(result, host);
+}
+
+// ---- clone() produces independent copy --------------------------------------
+
+template <typename Scalar>
+void test_clone(Index rows, Index cols) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  MatrixType host = MatrixType::Random(rows, cols);
+
+  auto dm = DeviceMatrix<Scalar>::fromHost(host);
+  auto cloned = dm.clone();
+
+  // Overwrite original with different data.
+  MatrixType other = MatrixType::Random(rows, cols);
+  dm = DeviceMatrix<Scalar>::fromHost(other);
+
+  // Clone still holds the original data.
+  MatrixType clone_result = cloned.toHost();
+  VERIFY_IS_APPROX(clone_result, host);
+
+  // Original holds the new data.
+  MatrixType dm_result = dm.toHost();
+  VERIFY_IS_APPROX(dm_result, other);
+}
+
+// ---- Move construct ---------------------------------------------------------
+
+template <typename Scalar>
+void test_move_construct(Index rows, Index cols) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  MatrixType host = MatrixType::Random(rows, cols);
+
+  auto dm = DeviceMatrix<Scalar>::fromHost(host);
+  DeviceMatrix<Scalar> moved(std::move(dm));
+
+  VERIFY(dm.empty());
+  VERIFY(dm.data() == nullptr);
+
+  VERIFY_IS_EQUAL(moved.rows(), rows);
+  VERIFY_IS_EQUAL(moved.cols(), cols);
+  MatrixType result = moved.toHost();
+  VERIFY_IS_APPROX(result, host);
+}
+
+// ---- Move assign ------------------------------------------------------------
+
+template <typename Scalar>
+void test_move_assign(Index rows, Index cols) {
+  using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
+  MatrixType host = MatrixType::Random(rows, cols);
+
+  auto dm = DeviceMatrix<Scalar>::fromHost(host);
+  DeviceMatrix<Scalar> dest;
+  dest = std::move(dm);
+
+  VERIFY(dm.empty());
+  VERIFY_IS_EQUAL(dest.rows(), rows);
+  MatrixType result = dest.toHost();
+  VERIFY_IS_APPROX(result, host);
+}
+
+// ---- resize() ---------------------------------------------------------------
+
+void test_resize() {
+  DeviceMatrix<double> dm(10, 20);
+  VERIFY_IS_EQUAL(dm.rows(), 10);
+  VERIFY_IS_EQUAL(dm.cols(), 20);
+
+  dm.resize(50, 30);
+  VERIFY_IS_EQUAL(dm.rows(), 50);
+  VERIFY_IS_EQUAL(dm.cols(), 30);
+  VERIFY(dm.data() != nullptr);
+
+  // Resize to same dimensions is a no-op.
+  double* ptr_before = dm.data();
+  dm.resize(50, 30);
+  VERIFY(dm.data() == ptr_before);
+}
+
+// ---- Empty / 0x0 matrix -----------------------------------------------------
+
+void test_empty() {
+  using MatrixType = Matrix<double, Dynamic, Dynamic>;
+  MatrixType empty_mat(0, 0);
+
+  auto dm = DeviceMatrix<double>::fromHost(empty_mat);
+  VERIFY(dm.empty());
+  VERIFY_IS_EQUAL(dm.rows(), 0);
+  VERIFY_IS_EQUAL(dm.cols(), 0);
+
+  MatrixType result = dm.toHost();
+  VERIFY_IS_EQUAL(result.rows(), 0);
+  VERIFY_IS_EQUAL(result.cols(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  // Square.
+  CALL_SUBTEST(test_roundtrip<Scalar>(1, 1));
+  CALL_SUBTEST(test_roundtrip<Scalar>(64, 64));
+  CALL_SUBTEST(test_roundtrip<Scalar>(256, 256));
+
+  // Rectangular.
+  CALL_SUBTEST(test_roundtrip<Scalar>(100, 7));
+  CALL_SUBTEST(test_roundtrip<Scalar>(7, 100));
+
+  // Async roundtrip.
+  CALL_SUBTEST(test_roundtrip_async<Scalar>(64, 64));
+  CALL_SUBTEST(test_roundtrip_async<Scalar>(100, 7));
+
+  CALL_SUBTEST(test_clone<Scalar>(64, 64));
+  CALL_SUBTEST(test_move_construct<Scalar>(64, 64));
+  CALL_SUBTEST(test_move_assign<Scalar>(64, 64));
+}
+
+EIGEN_DECLARE_TEST(gpu_device_matrix) {
+  CALL_SUBTEST(test_default_construct());
+  CALL_SUBTEST(test_empty());
+  CALL_SUBTEST(test_resize());
+  CALL_SUBTEST(test_host_transfer_ready());
+  CALL_SUBTEST(test_host_transfer_move());
+  CALL_SUBTEST((test_allocate<float>(100, 50)));
+  CALL_SUBTEST((test_allocate<double>(100, 50)));
+  CALL_SUBTEST(test_scalar<float>());
+  CALL_SUBTEST(test_scalar<double>());
+  CALL_SUBTEST(test_scalar<std::complex<float>>());
+  CALL_SUBTEST(test_scalar<std::complex<double>>());
+}
--- a/test/gpu_library_example.cu
+++ b/test/gpu_library_example.cu
@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Smoke test for GPU library test infrastructure.
+// Verifies GpuContext, GpuBuffer, and host<->device matrix transfers
+// without requiring any NVIDIA library (cuBLAS, cuSOLVER, etc.).
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include "gpu_context.h"
+#include "gpu_library_test_helper.h"
+
+using namespace Eigen;
+using namespace Eigen::test;
+
+// Test that GpuContext initializes, reports valid device info, and owns a cuSOLVER handle.
+void test_gpu_context() {
+  GpuContext ctx;
+  VERIFY(ctx.device() >= 0);
+  VERIFY(ctx.deviceProperties().major >= 7);  // sm_70 minimum
+  VERIFY(ctx.stream != nullptr);
+  VERIFY(ctx.cusolver != nullptr);
+  std::cout << "  GPU: " << ctx.deviceProperties().name << " (sm_" << ctx.deviceProperties().major
+            << ctx.deviceProperties().minor << ")\n";
+}
+
+// Test dense matrix roundtrip: host -> device -> host.
+template <typename MatrixType>
+void test_dense_roundtrip() {
+  GpuContext ctx;
+  const Index rows = 64;
+  const Index cols = 32;
+
+  MatrixType A = MatrixType::Random(rows, cols);
+  auto buf = gpu_copy_to_device(ctx.stream, A);
+  VERIFY(buf.data != nullptr);
+  VERIFY(buf.size == rows * cols);
+
+  MatrixType B(rows, cols);
+  B.setZero();
+  gpu_copy_to_host(ctx.stream, buf, B);
+  ctx.synchronize();
+
+  VERIFY_IS_EQUAL(A, B);
+}
+
+// Test GpuBuffer RAII: move semantics, async zero-init.
+void test_gpu_buffer() {
+  GpuContext ctx;
+
+  GpuBuffer<float> a(128);
+  VERIFY(a.data != nullptr);
+  VERIFY(a.size == 128);
+
+  // Move construction.
+  GpuBuffer<float> b(std::move(a));
+  VERIFY(a.data == nullptr);
+  VERIFY(b.data != nullptr);
+  VERIFY(b.size == 128);
+
+  // Move assignment.
+  GpuBuffer<float> c;
+  c = std::move(b);
+  VERIFY(b.data == nullptr);
+  VERIFY(c.data != nullptr);
+
+  // setZeroAsync.
+  c.setZeroAsync(ctx.stream);
+  ctx.synchronize();
+
+  std::vector<float> host(128);
+  GPU_CHECK(cudaMemcpy(host.data(), c.data, 128 * sizeof(float), cudaMemcpyDeviceToHost));
+  for (int i = 0; i < 128; ++i) {
+    VERIFY_IS_EQUAL(host[i], 0.0f);
+  }
+}
+
+// Test with vectors (1D).
+template <typename Scalar>
+void test_vector_roundtrip() {
+  GpuContext ctx;
+  const Index n = 256;
+  Matrix<Scalar, Dynamic, 1> v = Matrix<Scalar, Dynamic, 1>::Random(n);
+  auto buf = gpu_copy_to_device(ctx.stream, v);
+
+  Matrix<Scalar, Dynamic, 1> w(n);
+  w.setZero();
+  gpu_copy_to_host(ctx.stream, buf, w);
+  ctx.synchronize();
+
+  VERIFY_IS_EQUAL(v, w);
+}
+
+EIGEN_DECLARE_TEST(gpu_library_example) {
+  CALL_SUBTEST(test_gpu_context());
+  CALL_SUBTEST(test_gpu_buffer());
+  CALL_SUBTEST(test_dense_roundtrip<MatrixXf>());
+  CALL_SUBTEST(test_dense_roundtrip<MatrixXd>());
+  CALL_SUBTEST((test_dense_roundtrip<Matrix<float, Dynamic, Dynamic, RowMajor>>()));
+  CALL_SUBTEST((test_dense_roundtrip<Matrix<double, Dynamic, Dynamic, RowMajor>>()));
+  CALL_SUBTEST(test_vector_roundtrip<float>());
+  CALL_SUBTEST(test_vector_roundtrip<double>());
+  CALL_SUBTEST(test_vector_roundtrip<std::complex<float>>());
+}
--- a/test/gpu_library_test_helper.h
+++ b/test/gpu_library_test_helper.h
@@ -0,0 +1,90 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H
+#define EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H
+
+// Helpers for GPU tests that call NVIDIA library APIs (cuBLAS, cuSOLVER, etc.)
+// from the host side. Provides RAII GPU memory management and async matrix transfer.
+//
+// This is separate from gpu_common.h (element-parallel device kernels) and
+// gpu_test_helper.h (serialization-based device kernels). Those patterns run
+// user functors inside GPU kernels. This helper is for host-orchestrated tests
+// that call library APIs which launch their own kernels internally.
+//
+// All transfers use an explicit stream and cudaMemcpyAsync. Callers must
+// synchronize (ctx.synchronize() or cudaStreamSynchronize) before reading
+// results back on the host.
+
+#include "gpu_test_helper.h"
+
+namespace Eigen {
+namespace test {
+
+// RAII wrapper for GPU device memory. Prevents leaks when VERIFY macros abort.
+template <typename Scalar>
+struct GpuBuffer {
+  Scalar* data = nullptr;
+  Index size = 0;
+
+  GpuBuffer() = default;
+
+  explicit GpuBuffer(Index n) : size(n) { GPU_CHECK(gpuMalloc(reinterpret_cast<void**>(&data), n * sizeof(Scalar))); }
+
+  ~GpuBuffer() {
+    if (data) GPU_CHECK(gpuFree(data));
+  }
+
+  // Move-only.
+  GpuBuffer(GpuBuffer&& other) noexcept : data(other.data), size(other.size) {
+    other.data = nullptr;
+    other.size = 0;
+  }
+  GpuBuffer& operator=(GpuBuffer&& other) noexcept {
+    if (this != &other) {
+      if (data) GPU_CHECK(gpuFree(data));
+      data = other.data;
+      size = other.size;
+      other.data = nullptr;
+      other.size = 0;
+    }
+    return *this;
+  }
+
+  GpuBuffer(const GpuBuffer&) = delete;
+  GpuBuffer& operator=(const GpuBuffer&) = delete;
+
+  // Async zero the buffer on the given stream.
+  void setZeroAsync(cudaStream_t stream) { GPU_CHECK(cudaMemsetAsync(data, 0, size * sizeof(Scalar), stream)); }
+};
+
+// Copy a dense Eigen matrix to a new GPU buffer, async on the given stream.
+// Caller must synchronize before the host matrix is freed or modified.
+template <typename Derived>
+GpuBuffer<typename Derived::Scalar> gpu_copy_to_device(cudaStream_t stream, const MatrixBase<Derived>& host_mat) {
+  using Scalar = typename Derived::Scalar;
+  const auto& mat = host_mat.derived();
+  GpuBuffer<Scalar> buf(mat.size());
+  GPU_CHECK(cudaMemcpyAsync(buf.data, mat.data(), mat.size() * sizeof(Scalar), cudaMemcpyHostToDevice, stream));
+  return buf;
+}
+
+// Copy GPU buffer contents back to a dense Eigen matrix, async on the given stream.
+// Caller must synchronize before reading from host_mat.
+template <typename Scalar, typename Derived>
+void gpu_copy_to_host(cudaStream_t stream, const GpuBuffer<Scalar>& buf, MatrixBase<Derived>& host_mat) {
+  auto& mat = host_mat.derived();
+  eigen_assert(buf.size == mat.size());
+  GPU_CHECK(cudaMemcpyAsync(mat.data(), buf.data, mat.size() * sizeof(Scalar), cudaMemcpyDeviceToHost, stream));
+}
+
+}  // namespace test
+}  // namespace Eigen
+
+#endif  // EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H
--- a/test/gpu_test_helper.h
+++ b/test/gpu_test_helper.h
@@ -6,10 +6,8 @@
 // Allow gpu** macros for generic tests.
 #include <Eigen/src/Core/util/GpuHipCudaDefines.inc>

-// std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that
-// doesn't allow std::tuple to compile for host code either. In these cases,
-// use our custom implementation.
-#if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000)
+// std::tuple cannot be used on device, so use our custom implementation there.
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 #define EIGEN_USE_CUSTOM_TUPLE 1
 #else
 #define EIGEN_USE_CUSTOM_TUPLE 0
@@ -42,6 +40,12 @@ using tuple_impl::tuple;
 #undef EIGEN_USE_CUSTOM_TUPLE
 }  // namespace test_detail

+template <typename T>
+using decay_t = typename std::decay<T>::type;
+
+template <typename Func, typename... Args>
+using kernel_result_t = decltype(std::declval<Func>()(std::declval<Args>()...));
+
 template <size_t N, size_t Idx, typename OutputIndexSequence, typename... Ts>
 struct extract_output_indices_helper;

@@ -90,14 +94,15 @@ struct void_helper {
  // Non-void return value.
  template <typename Func, typename... Args>
  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
-      -> std::enable_if_t<!std::is_same<decltype(func(args...)), void>::value, decltype(func(args...))> {
+      -> std::enable_if_t<!std::is_same<kernel_result_t<Func&&, Args&&...>, void>::value,
+                          kernel_result_t<Func&&, Args&&...>> {
    return func(std::forward<Args>(args)...);
  }

  // Void return value.
  template <typename Func, typename... Args>
  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
-      -> std::enable_if_t<std::is_same<decltype(func(args...)), void>::value, Void> {
+      -> std::enable_if_t<std::is_same<kernel_result_t<Func&&, Args&&...>, void>::value, Void> {
    func(std::forward<Args>(args)...);
    return Void{};
  }
@@ -135,18 +140,18 @@ EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence<Indices...>, std::inde
  const uint8_t* read_end = buffer + capacity;
  read_ptr = Eigen::deserialize(read_ptr, read_end, input_size);
  // Create value-type instances to populate.
-  auto args = make_tuple(typename std::decay<Args>::type{}...);
+  auto args = make_tuple(decay_t<Args>{}...);
  EIGEN_UNUSED_VARIABLE(args);  // Avoid NVCC compile warning.
  // NVCC 9.1 requires us to spell out the template parameters explicitly.
-  read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, typename std::decay<Args>::type...>(args)...);
+  read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, decay_t<Args>...>(args)...);

  // Call function, with void->Void conversion so we are guaranteed a complete
  // output type.
-  auto result = void_helper::call(kernel, get<Indices, typename std::decay<Args>::type...>(args)...);
+  auto result = void_helper::call(kernel, get<Indices, decay_t<Args>...>(args)...);

  // Determine required output size.
  size_t output_size = Eigen::serialize_size(capacity);
-  output_size += Eigen::serialize_size(get<OutputIndices, typename std::decay<Args>::type...>(args)...);
+  output_size += Eigen::serialize_size(get<OutputIndices, decay_t<Args>...>(args)...);
  output_size += Eigen::serialize_size(result);

  // Always serialize required buffer size.
@@ -157,7 +162,7 @@ EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence<Indices...>, std::inde
  // Serialize outputs if they fit in the buffer.
  if (output_size <= capacity) {
    // Collect outputs and result.
-    write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, typename std::decay<Args>::type...>(args)...);
+    write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, decay_t<Args>...>(args)...);
    write_ptr = Eigen::serialize(write_ptr, write_end, result);
  }
 }
@@ -282,7 +287,7 @@ auto run_serialized_on_gpu(size_t buffer_capacity_hint, std::index_sequence<Indi
 * \return kernel(args...).
 */
 template <typename Kernel, typename... Args>
-auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run_on_cpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
  return kernel(std::forward<Args>(args)...);
 }

@@ -301,7 +306,7 @@ auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
 * \return kernel(args...).
 */
 template <typename Kernel, typename... Args>
-auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run_on_gpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
  return internal::run_serialized_on_gpu<Kernel, Args...>(
      /*buffer_capacity_hint=*/0, std::make_index_sequence<sizeof...(Args)>{},
      internal::extract_output_indices<Args...>{}, kernel, std::forward<Args>(args)...);
@@ -322,7 +327,8 @@ auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
 * \sa run_on_gpu
 */
 template <typename Kernel, typename... Args>
-auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args)
+    -> internal::kernel_result_t<Kernel, Args&&...> {
  return internal::run_serialized_on_gpu<Kernel, Args...>(
      buffer_capacity_hint, std::make_index_sequence<sizeof...(Args)>{}, internal::extract_output_indices<Args...>{},
      kernel, std::forward<Args>(args)...);
@@ -409,7 +415,7 @@ void print_gpu_device_info() {
 * \return kernel(args...).
 */
 template <typename Kernel, typename... Args>
-auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
 #ifdef EIGEN_GPUCC
  return run_on_gpu(kernel, std::forward<Args>(args)...);
 #else
@@ -432,7 +438,8 @@ auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
 * \sa run
 */
 template <typename Kernel, typename... Args>
-auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args)
+    -> internal::kernel_result_t<Kernel, Args&&...> {
 #ifdef EIGEN_GPUCC
  return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward<Args>(args)...);
 #else
--- a/test/main.h
+++ b/test/main.h
@@ -76,10 +76,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
-#if CUDA_VERSION >= 7050
 #include <cuda_fp16.h>
 #endif
-#endif

 #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
 #define EIGEN_TEST_NO_LONGDOUBLE
@@ -949,6 +947,37 @@ inline void set_seed_from_time() {
  g_seed = static_cast<decltype(g_seed)>(ns);
 }

+#if defined(EIGEN_USE_GPU)
+inline int maybe_skip_gpu_tests() {
+#if defined(EIGEN_USE_HIP)
+  int device_count = 0;
+  hipError_t status = hipGetDeviceCount(&device_count);
+  if (status != hipSuccess) {
+    std::cout << "SKIP: HIP GPU tests require a visible ROCm device. hipGetDeviceCount failed with: "
+              << hipGetErrorString(status) << std::endl;
+    return 77;
+  }
+  if (device_count <= 0) {
+    std::cout << "SKIP: HIP GPU tests require a visible ROCm device." << std::endl;
+    return 77;
+  }
+#elif defined(EIGEN_CUDACC)
+  int device_count = 0;
+  cudaError_t status = cudaGetDeviceCount(&device_count);
+  if (status != cudaSuccess) {
+    std::cout << "SKIP: CUDA GPU tests require a visible CUDA device. cudaGetDeviceCount failed with: "
+              << cudaGetErrorString(status) << std::endl;
+    return 77;
+  }
+  if (device_count <= 0) {
+    std::cout << "SKIP: CUDA GPU tests require a visible CUDA device." << std::endl;
+    return 77;
+  }
+#endif
+  return 0;
+}
+#endif
+
 int main(int argc, char* argv[]) {
  g_has_set_repeat = false;
  g_has_set_seed = false;
@@ -997,6 +1026,13 @@ int main(int argc, char* argv[]) {
  srand(g_seed);
  std::cout << "Repeating each test " << g_repeat << " times" << std::endl;

+#if defined(EIGEN_USE_GPU)
+  {
+    const int skip_code = maybe_skip_gpu_tests();
+    if (skip_code != 0) return skip_code;
+  }
+#endif
+
  VERIFY(EigenTest::all().size() > 0);

  for (std::size_t i = 0; i < EigenTest::all().size(); ++i) {
--- a/unsupported/Eigen/src/Tensor/TensorContractionGpu.h
+++ b/unsupported/Eigen/src/Tensor/TensorContractionGpu.h
@@ -393,7 +393,8 @@ __device__ EIGEN_STRONG_INLINE void EigenContractionKernelInternal(const LhsMapp
  // the sum across all big k blocks of the product of little k block of index (x, y)
  // with block of index (y, z). To compute the final output, we need to reduce
  // the 8 threads over y by summation.
-#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+  // HIP uses non-sync warp shuffles; CUDA requires the _sync variants.
+#if defined(EIGEN_HIPCC)
 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
 #else
 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
@@ -622,7 +623,7 @@ __device__ __forceinline__ void EigenFloatContractionKernelInternal16x16(const L
      x1 = rhs_pf0.x;
      x2 = rhs_pf0.z;
    }
-#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+#if defined(EIGEN_HIPCC)
    x1 = __shfl_xor(x1, 4);
    x2 = __shfl_xor(x2, 4);
 #else
@@ -1377,13 +1378,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
                  this->m_right_contracting_strides, this->m_k_strides);

    OutputMapper output(buffer, m);
-
-#if defined(EIGEN_USE_HIP)
-    setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
-#else
-    setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
-#endif
-
    LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k,
                                                                                        this->m_device);
  }
--- a/unsupported/Eigen/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/src/Tensor/TensorConvolution.h
@@ -89,7 +89,7 @@ class IndexMapper {
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
-        if (static_cast<size_t>(i + 1) < offset) {
+        if (i + 1 < static_cast<int>(offset)) {
          m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
          m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
        } else {
--- a/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h
@@ -342,19 +342,6 @@ struct GpuDevice {

 #endif

-// FIXME: Should be device and kernel specific.
-#ifdef EIGEN_GPUCC
-static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
-#ifndef EIGEN_GPU_COMPILE_PHASE
-  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
-  EIGEN_UNUSED_VARIABLE(status);
-  gpu_assert(status == gpuSuccess);
-#else
-  EIGEN_UNUSED_VARIABLE(config);
-#endif
-}
-#endif
-
 }  // end namespace Eigen

 // undefine all the gpu* macros we defined at the beginning of the file
--- a/unsupported/Eigen/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/src/Tensor/TensorEvaluator.h
@@ -175,7 +175,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T loadConstant(const T* address) {
  return *address;
 }
 // Use the texture cache on CUDA devices whenever possible
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_CUDA_ARCH)
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) {
  return __ldg(address);
--- a/unsupported/Eigen/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/src/Tensor/TensorMeta.h
@@ -49,7 +49,7 @@ struct PacketType : internal::packet_traits<Scalar> {
 };

 // For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPU_COMPILE_PHASE)

 typedef ulonglong2 Packet4h2;
 template <>
--- a/unsupported/Eigen/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/src/Tensor/TensorReduction.h
@@ -453,7 +453,7 @@ template <int B, int N, typename S, typename R, typename I_>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
                                                                 unsigned int*);

-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_GPUCC)
 template <typename S, typename R, typename I_>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
    R, const S, I_, internal::packet_traits<half>::type*);
@@ -883,7 +883,7 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
  template <int B, int N, typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_GPUCC)
  template <typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_,
                                                                     internal::packet_traits<Eigen::half>::type*);
--- a/unsupported/Eigen/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/src/Tensor/TensorReductionGpu.h
@@ -25,7 +25,6 @@ namespace internal {
 // updated the content of the output address it will try again.
 template <typename T, typename R>
 __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
@@ -61,12 +60,6 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
  } else {
    gpu_assert(0 && "Wordsize not supported");
  }
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(output);
-  EIGEN_UNUSED_VARIABLE(accum);
-  EIGEN_UNUSED_VARIABLE(reducer);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }

 // We extend atomicExch to support extra data types
@@ -75,13 +68,42 @@ __device__ inline Type atomicExchCustom(Type* address, Type val) {
  return atomicExch(address, val);
 }

+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR auto reduction_shuffle_mask() {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+  return 0xFFFFFFFFFFFFFFFFull;
+#else
+  return 0xFFFFFFFFu;
+#endif
+}
+
+template <typename T>
+__device__ EIGEN_ALWAYS_INLINE T reduction_shuffle_down(T value, int offset) {
+  return __shfl_down_sync(reduction_shuffle_mask<T>(), value, offset, warpSize);
+}
+
+template <>
+__device__ EIGEN_ALWAYS_INLINE int reduction_shuffle_down<int>(int value, int offset) {
+  return __shfl_down_sync(reduction_shuffle_mask<int>(), value, offset, warpSize);
+}
+
+template <>
+__device__ EIGEN_ALWAYS_INLINE float reduction_shuffle_down<float>(float value, int offset) {
+  return __shfl_down_sync(reduction_shuffle_mask<float>(), value, offset, warpSize);
+}
+
+template <>
+__device__ EIGEN_ALWAYS_INLINE double reduction_shuffle_down<double>(double value, int offset) {
+  return __shfl_down_sync(reduction_shuffle_mask<double>(), value, offset, warpSize);
+}
+
 template <>
 __device__ inline double atomicExchCustom(double* address, double val) {
  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
 }

-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename R>
 __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@@ -111,17 +133,10 @@ __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reduc
  }
 }
 #endif  // EIGEN_GPU_COMPILE_PHASE
-#endif  // EIGEN_HAS_GPU_FP16

 template <>
 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  atomicAdd(output, accum);
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(output);
-  EIGEN_UNUSED_VARIABLE(accum);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }

 template <typename CoeffType, typename Index>
@@ -138,7 +153,6 @@ template <int BlockSize, int NumPerThread, typename Self, typename Reducer, type
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                                                 typename Self::CoeffReturnType* output,
                                                                 unsigned int* semaphore) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  // Initialize the output value
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
  if (gridDim.x == 1) {
@@ -179,20 +193,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer

 #pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
-#if defined(EIGEN_HIPCC)
-    // use std::is_floating_point to determine the type of reduced_val
-    // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
-    // and list the float and int versions of __shfl_down as the candidate functions.
-    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
-      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
-    } else {
-      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
-    }
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
-#else
-    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
-#endif
+    reducer.reduce(reduction_shuffle_down(accum, offset), &accum);
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -206,17 +207,9 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer
    __threadfence_system();
 #endif
  }
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(reducer);
-  EIGEN_UNUSED_VARIABLE(input);
-  EIGEN_UNUSED_VARIABLE(num_coeffs);
-  EIGEN_UNUSED_VARIABLE(output);
-  EIGEN_UNUSED_VARIABLE(semaphore);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }

-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename Self, typename Reducer, typename Index>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input,
                                                                                   Index num_coeffs, half* scratch) {
@@ -319,14 +312,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce
      hr[i] = wka_out.h;
    }
    reducer.reducePacket(r1, &accum);
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-    PacketType r1;
-    half2* hr = reinterpret_cast<half2*>(&r1);
-    half2* hacc = reinterpret_cast<half2*>(&accum);
-    for (int i = 0; i < packet_width / 2; i++) {
-      hr[i] = __shfl_down(hacc[i], offset, warpSize);
-    }
-    reducer.reducePacket(r1, &accum);
 #else
    PacketType r1;
    half2* hr = reinterpret_cast<half2*>(&r1);
@@ -377,8 +362,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op
  }
 }

-#endif  // EIGEN_HAS_GPU_FP16
-
 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct FullReductionLauncher {
  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
@@ -409,7 +392,7 @@ struct FullReductionLauncher<
  }
 };

-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename Self, typename Op>
 struct FullReductionLauncher<Self, Op, Eigen::half, false> {
  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
@@ -443,24 +426,18 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
    }
  }
 };
-#endif  // EIGEN_HAS_GPU_FP16

 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Unfortunately nvidia doesn't support well exotic types such as complex,
  // so reduce the scope of the optimized version of the code to the simple cases
  // of doubles, floats and half floats
-#ifdef EIGEN_HAS_GPU_FP16
+  // Half-float reduction specializations.
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value ||
                                           (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
                                            reducer_traits<Op, GpuDevice>::PacketAccess));
-#else   // EIGEN_HAS_GPU_FP16
-  static constexpr bool HasOptimizedImplementation =
-      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif  // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
@@ -481,7 +458,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
                                                                  Index num_coeffs_to_reduce,
                                                                  Index num_preserved_coeffs,
                                                                  typename Self::CoeffReturnType* output) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
  typedef typename Self::CoeffReturnType Type;
  eigen_assert(blockDim.y == 1);
  eigen_assert(blockDim.z == 1);
@@ -534,20 +510,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce

 #pragma unroll
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
-#if defined(EIGEN_HIPCC)
-        // use std::is_floating_point to determine the type of reduced_val
-        // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
-        // and list the float and int versions of __shfl_down as the candidate functions.
-        if (std::is_floating_point<Type>::value) {
-          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
-        } else {
-          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
-        }
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
-#else
-        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
-#endif
+        reducer.reduce(reduction_shuffle_down(reduced_val, offset), &reduced_val);
      }

      if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -555,17 +518,9 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
      }
    }
  }
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(reducer);
-  EIGEN_UNUSED_VARIABLE(input);
-  EIGEN_UNUSED_VARIABLE(num_coeffs_to_reduce);
-  EIGEN_UNUSED_VARIABLE(num_preserved_coeffs);
-  EIGEN_UNUSED_VARIABLE(output);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }

-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.

 template <int NumPerThread, typename Self, typename Reducer, typename Index>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
@@ -688,19 +643,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
        }
        reducer.reducePacket(r1, &reduced_val1);
        reducer.reducePacket(r2, &reduced_val2);
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-        PacketType r1;
-        PacketType r2;
-        half2* hr1 = reinterpret_cast<half2*>(&r1);
-        half2* hr2 = reinterpret_cast<half2*>(&r2);
-        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
-        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
-        for (int i = 0; i < packet_width / 2; i++) {
-          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
-          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
-        }
-        reducer.reducePacket(r1, &reduced_val1);
-        reducer.reducePacket(r2, &reduced_val2);
 #else
        PacketType r1;
        PacketType r2;
@@ -741,8 +683,6 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
  }
 }

-#endif  // EIGEN_HAS_GPU_FP16
-
 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct InnerReductionLauncher {
  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index,
@@ -786,7 +726,7 @@ struct InnerReductionLauncher<
  }
 };

-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename Self, typename Op>
 struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
@@ -826,24 +766,18 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
    return false;
  }
 };
-#endif  // EIGEN_HAS_GPU_FP16

 template <typename Self, typename Op>
 struct InnerReducer<Self, Op, GpuDevice> {
  // Unfortunately nvidia doesn't support well exotic types such as complex,
  // so reduce the scope of the optimized version of the code to the simple case
  // of floats and half floats.
-#ifdef EIGEN_HAS_GPU_FP16
+  // Half-float reduction specializations.
  static constexpr bool HasOptimizedImplementation =
      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                           internal::is_same<typename Self::CoeffReturnType, double>::value ||
                                           (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
                                            reducer_traits<Op, GpuDevice>::PacketAccess));
-#else   // EIGEN_HAS_GPU_FP16
-  static constexpr bool HasOptimizedImplementation =
-      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif  // EIGEN_HAS_GPU_FP16

  template <typename OutputType>
  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -237,7 +237,7 @@ if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "MS
  ei_add_test(cxx11_tensor_uint128)
 endif()

-find_package(CUDA 9.0)
+find_package(CUDA 11.4)
 if(CUDA_FOUND AND EIGEN_TEST_CUDA)
  # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
  # and -fno-check-new flags since they trigger thousands of compilation warnings
@@ -281,26 +281,11 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
  ei_add_test(cxx11_tensor_argmax_gpu)
  ei_add_test(cxx11_tensor_cast_float16_gpu)
  ei_add_test(cxx11_tensor_scan_gpu)
-
-  set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH 9999)
-  foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
-    if(${ARCH} LESS ${EIGEN_CUDA_OLDEST_COMPUTE_ARCH})
-      set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH ${ARCH})
-    endif()
-  endforeach()
-
-  # Contractions require arch 3.0 or higher
-  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 29)
-    ei_add_test(cxx11_tensor_device)
-    ei_add_test(cxx11_tensor_gpu)
-    ei_add_test(cxx11_tensor_contract_gpu)
-    ei_add_test(cxx11_tensor_of_float16_gpu)
-  endif()
-
-  # The random number generation code requires arch 3.5 or greater.
-  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 34)
-    ei_add_test(cxx11_tensor_random_gpu)
-  endif()
+  ei_add_test(cxx11_tensor_device)
+  ei_add_test(cxx11_tensor_gpu)
+  ei_add_test(cxx11_tensor_contract_gpu)
+  ei_add_test(cxx11_tensor_of_float16_gpu)
+  ei_add_test(cxx11_tensor_random_gpu)

  unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 endif()
@@ -341,7 +326,6 @@ if (EIGEN_TEST_HIP)
      ei_add_test(cxx11_tensor_cast_float16_gpu)
      ei_add_test(cxx11_tensor_scan_gpu)
      ei_add_test(cxx11_tensor_device)
-
      ei_add_test(cxx11_tensor_gpu)
      ei_add_test(cxx11_tensor_contract_gpu)
      ei_add_test(cxx11_tensor_of_float16_gpu)
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -850,6 +850,7 @@ void test_gpu_igamma() {
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
+  Tensor<Scalar, 2> expected_out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
@@ -862,14 +863,11 @@ void test_gpu_igamma() {
    }
  }

-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-  Scalar igamma_s[][6] = {
-      {0.0, nan, nan, nan, nan, nan},
-      {0.0, 0.6321205588285578, 0.7768698398515702, 0.9816843611112658, 9.999500016666262e-05, 1.0},
-      {0.0, 0.4275932955291202, 0.608374823728911, 0.9539882943107686, 7.522076445089201e-07, 1.0},
-      {0.0, 0.01898815687615381, 0.06564245437845008, 0.5665298796332909, 4.166333347221828e-18, 1.0},
-      {0.0, 0.9999780593618628, 0.9999899967080838, 0.9999996219837988, 0.9991370418689945, 1.0},
-      {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      expected_out(i, j) = numext::igamma(a(i, j), x(i, j));
+    }
+  }

  std::size_t bytes = a.size() * sizeof(Scalar);

@@ -897,10 +895,10 @@ void test_gpu_igamma() {

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
-      if ((std::isnan)(igamma_s[i][j])) {
+      if ((std::isnan)(expected_out(i, j))) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
-        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
+        VERIFY_IS_APPROX(out(i, j), expected_out(i, j));
      }
    }
  }
@@ -915,6 +913,7 @@ void test_gpu_igammac() {
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
+  Tensor<Scalar, 2> expected_out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
@@ -927,14 +926,11 @@ void test_gpu_igammac() {
    }
  }

-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-  Scalar igammac_s[][6] = {
-      {nan, nan, nan, nan, nan, nan},
-      {1.0, 0.36787944117144233, 0.22313016014842982, 0.018315638888734182, 0.9999000049998333, 0.0},
-      {1.0, 0.5724067044708798, 0.3916251762710878, 0.04601170568923136, 0.9999992477923555, 0.0},
-      {1.0, 0.9810118431238462, 0.9343575456215499, 0.4334701203667089, 1.0, 0.0},
-      {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, 3.7801620118431334e-07, 0.0008629581310054535, 0.0},
-      {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      expected_out(i, j) = numext::igammac(a(i, j), x(i, j));
+    }
+  }

  std::size_t bytes = a.size() * sizeof(Scalar);

@@ -962,10 +958,10 @@ void test_gpu_igammac() {

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
-      if ((std::isnan)(igammac_s[i][j])) {
+      if ((std::isnan)(expected_out(i, j))) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
-        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
+        VERIFY_IS_APPROX(out(i, j), expected_out(i, j));
      }
    }
  }
@@ -1068,15 +1064,9 @@ void test_gpu_ndtri() {
  in_x(7) = Scalar(0.99);
  in_x(8) = Scalar(0.01);

-  expected_out(0) = std::numeric_limits<Scalar>::infinity();
-  expected_out(1) = -std::numeric_limits<Scalar>::infinity();
-  expected_out(2) = Scalar(0.0);
-  expected_out(3) = Scalar(-0.8416212335729142);
-  expected_out(4) = Scalar(0.8416212335729142);
-  expected_out(5) = Scalar(1.2815515655446004);
-  expected_out(6) = Scalar(-1.2815515655446004);
-  expected_out(7) = Scalar(2.3263478740408408);
-  expected_out(8) = Scalar(-2.3263478740408408);
+  for (int i = 0; i < 9; ++i) {
+    expected_out(i) = numext::ndtri(in_x(i));
+  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);

@@ -1090,15 +1080,15 @@ void test_gpu_ndtri() {
  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 9);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 9);

  gpu_out.device(gpu_device) = gpu_in_x.ndtri();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

-  for (int i = 0; i < 6; ++i) {
+  for (int i = 0; i < 9; ++i) {
    VERIFY_IS_CWISE_APPROX(out(i), expected_out(i));
  }

@@ -1115,12 +1105,9 @@ void test_gpu_betainc() {
  Tensor<Scalar, 1> expected_out(125);
  out.setZero();

-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-
  Array<Scalar, 1, Dynamic> x(125);
  Array<Scalar, 1, Dynamic> a(125);
  Array<Scalar, 1, Dynamic> b(125);
-  Array<Scalar, 1, Dynamic> v(125);

  a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
@@ -1160,25 +1147,11 @@ void test_gpu_betainc() {
      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;

-  v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
-      nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
-      0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, 0.999995949033062, 0.9999999999993698,
-      0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan,
-      nan, nan, 0.006827081192655869, 0.0210336989586256, 0.04813160422599567, nan, nan, 0.20014344256217678,
-      0.5000000000000001, 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, 0.9999999999999999, nan,
-      nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
-      1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan, nan, 7.864342668429763e-23,
-      3.015969667594166e-10, 0.0008598571564165444, nan, nan, 6.031987710123844e-08, 0.5000000000000007,
-      0.9999999396801229, nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan,
-      nan, nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
-      1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan,
-      2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan;
-
  for (int i = 0; i < 125; ++i) {
    in_x(i) = x(i);
    in_a(i) = a(i);
    in_b(i) = b(i);
-    expected_out(i) = v(i);
+    expected_out(i) = numext::betainc(a(i), b(i), x(i));
  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);
--- a/unsupported/test/cxx11_tensor_of_float16_gpu.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
@@ -53,8 +53,6 @@ void test_gpu_numext() {
  gpu_device.deallocate(d_res_float);
 }

-#ifdef EIGEN_HAS_GPU_FP16
-
 template <typename>
 void test_gpu_conversion() {
  Eigen::GpuStreamDevice stream;
@@ -442,12 +440,10 @@ void test_gpu_forced_evals() {
  gpu_device.deallocate(d_res_half2);
  gpu_device.deallocate(d_res_float);
 }
-#endif

 EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) {
  CALL_SUBTEST_1(test_gpu_numext<void>());

-#ifdef EIGEN_HAS_GPU_FP16
  CALL_SUBTEST_1(test_gpu_conversion<void>());
  CALL_SUBTEST_1(test_gpu_unary<void>());
  CALL_SUBTEST_1(test_gpu_elementwise<void>());
@@ -456,7 +452,4 @@ EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) {
  CALL_SUBTEST_3(test_gpu_reductions<void>());
  CALL_SUBTEST_4(test_gpu_full_reductions<void>());
  CALL_SUBTEST_5(test_gpu_forced_evals<void>());
-#else
-  std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
-#endif
 }
Author	SHA1	Message	Date
Rasmus Munk Larsen	43a95b62bb	GPU: Add sparse solvers, FFT, and SpMV (cuDSS, cuFFT, cuSPARSE) Add GPU sparse direct solvers (Cholesky, LDL^T, LU) via cuDSS, 1D/2D FFT via cuFFT with plan caching, and sparse matrix-vector/matrix multiply (SpMV/SpMM) via cuSPARSE. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-09 19:11:49 -07:00
Rasmus Munk Larsen	8593c7f5a1	GPU: Add dense cuSOLVER solvers (QR, SVD, EigenSolver) Add QR (geqrf + ormqr + trsm), SVD (gesvd), and self-adjoint eigenvalue decomposition (syevd) via cuSOLVER. All support host and DeviceMatrix input. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-04-09 19:11:34 -07:00
Rasmus Munk Larsen	58c44ef36d	GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER) Add Eigen/GPU module: A standalone GPU library dispatch layer where DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls. CPU and GPU solvers coexist in the same binary with compatible syntax. Core infrastructure: - DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with async host transfer (fromHost/toHost) and CUDA event-based cross-stream synchronization. - GpuContext: Unified execution context owning a CUDA stream + cuBLAS handle + cuSOLVER handle. Thread-local default with explicit override via setThreadLocal(). Stream-borrowing constructor for integration. - DeviceBuffer: Typed RAII device allocation with move semantics. cuBLAS dispatch (expression syntax): - GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm) - TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm) - SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm) - SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk) cuSOLVER dispatch: - GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs) - GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs) - Solver chaining: auto x = d_A.llt().solve(d_B) - Solver expressions with .device(ctx) for explicit stream control. CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11, Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit. Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp, gpu_device_matrix.cpp, gpu_library_example.cu Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp, bench_gpu_batching.cpp	2026-04-09 19:05:25 -07:00
Rasmus Munk Larsen	6a9405bf7a	GPU: Raise CUDA/HIP minimum and remove legacy guards - Raise CUDA minimum from 9.0 to 11.4 (sm_70/Volta). - Raise HIP minimum to GFX906 (Vega 20/MI50) / ROCm 5.6. - Remove EIGEN_HAS_{CUDA,HIP,GPU}_FP16 guards — FP16 is always available on sm_70+ and GFX906+. - Remove obsolete __HIP_ARCH_HAS_* preprocessor branches. - C++14 cleanup: remove pre-C++14 workarounds in GPU code. - Fix NVCC warnings (deprecated register keyword, unreachable code, tautological comparisons). - Fix HIP test execution on gfx1151. - Update CI configuration for new minimum versions.	2026-04-09 15:21:39 -07:00