Compare commits


1 Commit

Author SHA1 Message Date
Rasmus Munk Larsen
111c4d23a9 Revert "Revert "Speed up plog_double ~1.7x with fast integer range reduction""
This reverts commit b1d2ce4c85
2026-04-08 13:10:27 -07:00
57 changed files with 878 additions and 6215 deletions

View File

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.17)
cmake_minimum_required(VERSION 3.10.0)
#==============================================================================
# CMake Policy issues.
@@ -9,7 +9,7 @@ if (POLICY CMP0077)
endif (POLICY CMP0077)
# NOTE Remove setting the policy once the minimum required CMake version is
# increased to at least 3.21. Retain enabling the export to package registry.
# increased to at least 3.15. Retain enabling the export to package registry.
if (POLICY CMP0090)
# The export command does not populate package registry by default
cmake_policy (SET CMP0090 NEW)
@@ -672,7 +672,7 @@ if (EIGEN_BUILD_TESTING)
endif()
set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.")
set(EIGEN_CUDA_COMPUTE_ARCH 70 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")
set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")
option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
if(EIGEN_TEST_SYCL)
@@ -817,3 +817,4 @@ endif()
message(STATUS "")
message(STATUS "Configured Eigen ${EIGEN_VERSION_STRING}")
message(STATUS "")

View File

@@ -50,9 +50,9 @@
#include "src/Core/util/AOCL_Support.h"
// EIGEN_HAS_GPU_FP16 is now always true when compiling with CUDA or HIP.
// Use EIGEN_GPUCC (compile-time) or EIGEN_GPU_COMPILE_PHASE (device phase) instead.
// TODO: Remove EIGEN_HAS_GPU_BF16 similarly once HIP bf16 guards are cleaned up.
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
#define EIGEN_HAS_GPU_FP16
#endif
#if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16)
#define EIGEN_HAS_GPU_BF16

View File

@@ -1,55 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_GPU_MODULE_H
#define EIGEN_GPU_MODULE_H
#include "Core"
#include "src/Core/util/DisableStupidWarnings.h"
/** \defgroup GPU_Module GPU module
*
* GPU-accelerated solvers and operations using NVIDIA CUDA libraries
* (cuSOLVER, cuBLAS, cuSPARSE, cuFFT, cuDSS).
*
* This module provides explicit GPU solver classes that coexist with Eigen's
* CPU solvers. Unlike the LAPACKE dispatch (which replaces the CPU
* implementation globally), GPU classes are separate types the user
* instantiates by choice:
*
* \code
* #define EIGEN_USE_GPU
* #include <Eigen/GPU>
*
* // CPU path (unchanged)
* Eigen::LLT<Eigen::MatrixXd> llt_cpu(A);
*
* // GPU path (explicit)
* Eigen::GpuLLT<double> llt_gpu(A); // L stays on device
* auto X = llt_gpu.solve(B); // only B transferred per solve
* \endcode
*
* Requires CUDA 11.4+. See CLAUDE.md.
*/
#ifdef EIGEN_USE_GPU
// IWYU pragma: begin_exports
#include "src/GPU/DeviceMatrix.h"
#include "src/GPU/GpuContext.h"
#include "src/GPU/DeviceExpr.h"
#include "src/GPU/DeviceBlasExpr.h"
#include "src/GPU/DeviceSolverExpr.h"
#include "src/GPU/DeviceDispatch.h"
#include "src/GPU/GpuLLT.h"
#include "src/GPU/GpuLU.h"
// IWYU pragma: end_exports
#endif
#include "src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_GPU_MODULE_H

View File

@@ -858,8 +858,16 @@ struct hash<Eigen::bfloat16> {
} // namespace std
#endif
// Warp shuffle overloads for Eigen::bfloat16.
// HIP uses non-sync __shfl variants; CUDA has native __nv_bfloat16 support in __shfl_sync.
// Add the missing shfl* intrinsics.
// The __shfl* functions are only valid on HIP or __CUDA_ARCH__ >= 300.
// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
//
// HIP and CUDA prior to SDK 9.0 define
// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
// CUDA since 9.0 deprecates those and instead defines
// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
// with native support for __half and __nv_bfloat16
//
// Note that the following are __device__ - only functions.
#if defined(EIGEN_HIPCC)

View File

@@ -141,158 +141,69 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Pac
return plog_impl_float<Packet, /* base2 */ true>(_x);
}
// -----------------------------------------------------------------------
// Double logarithm: shared polynomial + two range-reduction backends
// -----------------------------------------------------------------------
// Cephes rational-polynomial approximation of log(1+f) for
// f in [sqrt(0.5)-1, sqrt(2)-1].
// Evaluates x - 0.5*x^2 + x^3 * P(x)/Q(x) where P and Q are degree-5.
// See: http://www.netlib.org/cephes/
template <typename Packet>
EIGEN_STRONG_INLINE Packet plog_mantissa_double(const Packet x) {
const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
// Q0 = 1.0; pmadd(1, x, q1) simplifies to padd(x, q1).
const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
Packet x2 = pmul(x, x);
Packet x3 = pmul(x2, x);
// Evaluate P and Q simultaneously for better ILP.
Packet y, y1, y_;
y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
y = pmadd(y, x, cst_cephes_log_p2);
y1 = pmadd(y1, x, cst_cephes_log_p5);
y_ = pmadd(y, x3, y1);
y = padd(x, cst_cephes_log_q1);
y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
y = pmadd(y, x, cst_cephes_log_q2);
y1 = pmadd(y1, x, cst_cephes_log_q5);
y = pmadd(y, x3, y1);
y_ = pmul(y_, x3);
y = pdiv(y_, y);
y = pnmadd(pset1<Packet>(0.5), x2, y);
return padd(x, y);
}
// Detect whether unpacket_traits<Packet>::integer_packet is defined.
template <typename Packet, typename = void>
struct packet_has_integer_packet : std::false_type {};
template <typename Packet>
struct packet_has_integer_packet<Packet, void_t<typename unpacket_traits<Packet>::integer_packet>> : std::true_type {};
// Dispatch struct for double-precision range reduction.
// Primary template: pfrexp-based fallback (used when integer_packet is absent).
template <typename Packet, bool UseIntegerPacket>
struct plog_range_reduce_double {
EIGEN_STRONG_INLINE static void run(const Packet v, Packet& f, Packet& e) {
const Packet one = pset1<Packet>(1.0);
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
// pfrexp: f in [0.5, 1), e = unbiased exponent as double.
f = pfrexp(v, e);
// Shift [0.5,1) -> [sqrt(0.5)-1, sqrt(2)-1] with exponent correction:
// if f < sqrt(0.5): f = f + f - 1, e -= 1 (giving f in [0, sqrt(2)-1))
// else: f = f - 1 (giving f in [sqrt(0.5)-1, 0))
Packet mask = pcmp_lt(f, cst_cephes_SQRTHF);
Packet tmp = pand(f, mask);
f = psub(f, one);
e = psub(e, pand(one, mask));
f = padd(f, tmp);
}
};
// Specialisation: fast integer-bit-manipulation path (musl-inspired).
// Requires unpacket_traits<Packet>::integer_packet to be a 64-bit integer packet.
template <typename Packet>
struct plog_range_reduce_double<Packet, true> {
EIGEN_STRONG_INLINE static void run(const Packet v, Packet& f, Packet& e) {
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
// 2^-1022: smallest positive normal double.
const PacketI cst_min_normal = pset1<PacketI>(static_cast<int64_t>(0x0010000000000000LL));
// Lower 52-bit mask (IEEE mantissa field).
const PacketI cst_mant_mask = pset1<PacketI>(static_cast<int64_t>(0x000FFFFFFFFFFFFFLL));
// Offset = 1.0_bits - sqrt(0.5)_bits. Adding this to the integer
// representation shifts the exponent field so that the [sqrt(0.5), sqrt(2))
// half-octave boundary falls on an exact biased-exponent boundary, letting
// us extract e with a single right shift. The constant is:
// 0x3FF0000000000000 - 0x3FE6A09E667F3BCD = 0x00095F619980C433
const PacketI cst_sqrt_half_offset =
pset1<PacketI>(static_cast<int64_t>(0x3FF0000000000000LL - 0x3FE6A09E667F3BCDLL));
// IEEE double exponent bias (1023).
const PacketI cst_exp_bias = pset1<PacketI>(static_cast<int64_t>(1023));
// sqrt(0.5) IEEE bits — used to reconstruct f from biased mantissa.
const PacketI cst_half_mant = pset1<PacketI>(static_cast<int64_t>(0x3FE6A09E667F3BCDLL));
// Reinterpret v as a 64-bit integer vector.
PacketI vi = preinterpret<PacketI>(v);
// Normalise denormals: multiply by 2^52 and correct the exponent by -52.
PacketI is_denormal = pcmp_lt(vi, cst_min_normal);
// 2^52 via bit pattern: biased exponent = 52 + 1023 = 0x433, mantissa = 0.
Packet v_norm = pmul(v, pset1frombits<Packet>(static_cast<uint64_t>(int64_t(52 + 0x3ff) << 52)));
vi = pselect(is_denormal, preinterpret<PacketI>(v_norm), vi);
PacketI denorm_adj = pand(is_denormal, pset1<PacketI>(static_cast<int64_t>(52)));
// Bias the integer representation so the exponent field directly encodes
// the half-octave index.
PacketI vi_biased = padd(vi, cst_sqrt_half_offset);
// Extract unbiased exponent: shift out mantissa bits, subtract IEEE bias
// and denormal adjustment.
PacketI e_int = psub(psub(plogical_shift_right<52>(vi_biased), cst_exp_bias), denorm_adj);
// Convert integer exponent to floating-point.
e = pcast<PacketI, Packet>(e_int);
// Reconstruct mantissa in [sqrt(0.5), sqrt(2)) via integer arithmetic.
// The integer addition of the masked mantissa bits and the sqrt(0.5) bit
// pattern carries into the exponent field, yielding a value in that range.
// Then subtract 1 to centre on 0: f in [sqrt(0.5)-1, sqrt(2)-1].
f = psub(preinterpret<Packet>(padd(pand(vi_biased, cst_mant_mask), cst_half_mant)), pset1<Packet>(1.0));
}
};
// Core range reduction and polynomial for double logarithm.
// Input: v > 0 (zero / negative / inf / nan are handled by the caller).
// Output: log_mantissa ≈ log(mantissa of v in [sqrt(0.5), sqrt(2))),
// e = unbiased exponent of v as a double.
// Selects the fast integer path when integer_packet is available, otherwise
// falls back to pfrexp.
// Core range reduction and polynomial evaluation for double logarithm.
//
// Same structure as plog_core_float but for double precision.
// Given a positive double v (may be denormal), decomposes it as
// v = 2^e * (1+f) with f in [sqrt(0.5)-1, sqrt(2)-1], then evaluates
// log(1+f) ≈ f - 0.5*f^2 + f^3 * P(f)/Q(f) using the Cephes [5/5]
// rational approximation.
template <typename Packet>
EIGEN_STRONG_INLINE void plog_core_double(const Packet v, Packet& log_mantissa, Packet& e) {
Packet f;
plog_range_reduce_double<Packet, packet_has_integer_packet<Packet>::value>::run(v, f, e);
log_mantissa = plog_mantissa_double(f);
typedef typename unpacket_traits<Packet>::integer_packet PacketL;
const PacketL cst_min_normal = pset1<PacketL>(int64_t(0x0010000000000000LL));
const PacketL cst_mant_mask = pset1<PacketL>(int64_t(0x000fffffffffffffLL));
const PacketL cst_sqrt_half_offset = pset1<PacketL>(int64_t(0x00095f619980c433LL));
const PacketL cst_exp_bias = pset1<PacketL>(int64_t(0x3ff)); // 1023
const PacketL cst_half_mant = pset1<PacketL>(int64_t(0x3fe6a09e667f3bcdLL)); // sqrt(0.5)
// Normalize denormals by multiplying by 2^52.
PacketL vi = preinterpret<PacketL>(v);
PacketL is_denormal = pcmp_lt(vi, cst_min_normal);
Packet v_normalized = pmul(v, pset1<Packet>(4503599627370496.0)); // 2^52
vi = pselect(is_denormal, preinterpret<PacketL>(v_normalized), vi);
PacketL denorm_adj = pand(is_denormal, pset1<PacketL>(int64_t(52)));
// Combined range reduction via integer bias (same trick as float version).
PacketL vi_biased = padd(vi, cst_sqrt_half_offset);
PacketL e_int = psub(psub(plogical_shift_right<52>(vi_biased), cst_exp_bias), denorm_adj);
e = pcast<PacketL, Packet>(e_int);
Packet f = psub(preinterpret<Packet>(padd(pand(vi_biased, cst_mant_mask), cst_half_mant)), pset1<Packet>(1.0));
// Rational approximation log(1+f) = f - 0.5*f^2 + f^3 * P(f)/Q(f)
// from Cephes, [5/5] rational on [sqrt(0.5)-1, sqrt(2)-1].
Packet f2 = pmul(f, f);
Packet f3 = pmul(f2, f);
// Evaluate P and Q in factored form for instruction-level parallelism.
Packet y, y1, y_;
y = pmadd(pset1<Packet>(1.01875663804580931796E-4), f, pset1<Packet>(4.97494994976747001425E-1));
y1 = pmadd(pset1<Packet>(1.44989225341610930846E1), f, pset1<Packet>(1.79368678507819816313E1));
y = pmadd(y, f, pset1<Packet>(4.70579119878881725854E0));
y1 = pmadd(y1, f, pset1<Packet>(7.70838733755885391666E0));
y_ = pmadd(y, f3, y1);
y = pmadd(pset1<Packet>(1.0), f, pset1<Packet>(1.12873587189167450590E1));
y1 = pmadd(pset1<Packet>(8.29875266912776603211E1), f, pset1<Packet>(7.11544750618563894466E1));
y = pmadd(y, f, pset1<Packet>(4.52279145837532221105E1));
y1 = pmadd(y1, f, pset1<Packet>(2.31251620126765340583E1));
y = pmadd(y, f3, y1);
y_ = pmul(y_, f3);
y = pdiv(y_, y);
y = pmadd(pset1<Packet>(-0.5), f2, y);
log_mantissa = padd(f, y);
}
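For readers following the integer trick above, here is a minimal scalar sketch of the same range reduction (the helper name, the std::memcpy bit-casting, and the omission of the denormal fix-up are illustrative assumptions, not part of this change):

#include <cstdint>
#include <cstring>

// Standalone illustration: decompose v > 0 (normal) as v = 2^e * (1 + f) with
// 1 + f in [sqrt(0.5), sqrt(2)), using only integer arithmetic on the bits of v.
inline void scalar_log_range_reduce(double v, double& f, int& e) {
  uint64_t bits;
  std::memcpy(&bits, &v, sizeof(bits));                            // reinterpret v as raw IEEE bits
  bits += 0x3FF0000000000000ull - 0x3FE6A09E667F3BCDull;           // 1.0_bits - sqrt(0.5)_bits
  e = static_cast<int>(bits >> 52) - 1023;                         // exponent field -> unbiased e
  bits = (bits & 0x000FFFFFFFFFFFFFull) + 0x3FE6A09E667F3BCDull;   // rebuild mantissa in [sqrt(0.5), sqrt(2))
  double m;
  std::memcpy(&m, &bits, sizeof(m));
  f = m - 1.0;
}
// For v = 3.0 this gives e = 2 and f = -0.25, i.e. 3.0 = 2^2 * 0.75 with 0.75 in
// [sqrt(0.5), sqrt(2)); denormal inputs additionally need the 2^52 scaling shown
// in the packet code above.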
/* Returns the base e (2.718...) or base 2 logarithm of x.
* The argument is separated into its exponent and fractional parts.
* The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
* is approximated by
*
* log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
*
* for more detail see: http://www.netlib.org/cephes/
*/
// Natural or base-2 logarithm for double packets.
template <typename Packet, bool base2>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) {
const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
Packet log_mantissa, e;
plog_core_double(_x, log_mantissa, e);
// Combine: log(x) = e * ln2 + log(mantissa), or log2(x) = log(mantissa)*log2e + e.
// Add the logarithm of the exponent back to the result.
Packet x;
if (base2) {
const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
@@ -302,13 +213,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(cons
x = pmadd(e, cst_ln2, log_mantissa);
}
const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
Packet iszero_mask = pcmp_eq(_x, pzero(_x));
Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
// Filter out invalid inputs:
// - negative arg → NAN
// - 0 → -INF
// - +INF → +INF
return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
}
@@ -362,11 +271,11 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_float(c
return result;
}
/** \internal \returns log(1 + x) for double precision.
Computes log(1+x) using plog_core_double for the core range reduction and
polynomial evaluation. The rounding error from forming u = fl(1+x) is
recovered as dx = x - (u - 1) and folded in as a first-order correction
dx/u after the polynomial evaluation.
/** \internal \returns log(1 + x) for double precision float.
Computes log(1+x) using plog_core_double for the core range reduction
and polynomial evaluation. The rounding error from forming u = fl(1+x)
is recovered as dx = x - (u - 1), and folded in as a first-order
correction dx/u after the polynomial evaluation.
*/
template <typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(const Packet& x) {
@@ -374,7 +283,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(
const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
// u = 1 + x, with rounding. Recover the lost low bits: dx = x - (u - 1).
Packet u = padd(one, x);
Packet dx = psub(x, psub(u, one));
@@ -398,7 +307,7 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p_double(
result = pselect(small_mask, x, result);
result = pselect(inf_mask, cst_pos_inf, result);
result = pselect(zero_mask, cst_minus_inf, result);
result = por(neg_mask, result); // NaN for x < -1
result = por(neg_mask, result);
return result;
}
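As a side note, a minimal scalar sketch of the dx/u correction described in the doc comment above (std::log stands in for the packet log of u, the function name is hypothetical, and the special cases for x == -1, +inf and x < -1 handled above are omitted):

#include <cmath>

inline double log1p_sketch(double x) {
  double u = 1.0 + x;          // rounded sum; low bits of x may be lost here
  double dx = x - (u - 1.0);   // exactly the part of x that rounding dropped
  // log(1 + x) = log(u + dx) = log(u) + log(1 + dx/u) ~= log(u) + dx/u,
  // since dx/u is at most on the order of one ulp of u.
  return std::log(u) + dx / u;
}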

View File

@@ -45,7 +45,7 @@
// Eigen with GPU support.
// Any functions that require `numext::bit_cast` may also not be constexpr,
// including any native types when setting via raw bit values.
#if defined(EIGEN_GPUCC) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
#define _EIGEN_MAYBE_CONSTEXPR
#else
#define _EIGEN_MAYBE_CONSTEXPR constexpr
@@ -121,12 +121,12 @@ namespace half_impl {
//
// Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
// this error, and hence the following convoluted #if condition
#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)
#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
// Make our own __half_raw definition that is similar to CUDA's.
struct __half_raw {
struct construct_from_rep_tag {};
#if (defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE))
#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
// Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
// The element type for shared memory cannot have non-trivial constructors
// and hence the following special casing (which skips the zero-initialization).
@@ -152,12 +152,16 @@ struct __half_raw {
#endif
};
#elif defined(EIGEN_HIPCC)
#elif defined(EIGEN_HAS_HIP_FP16)
// HIP GPU compile phase: nothing to do here.
// HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_CUDACC)
#elif defined(EIGEN_HAS_CUDA_FP16)
// CUDA GPU compile phase.
#if EIGEN_CUDA_SDK_VER < 90000
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
typedef __half __half_raw;
#endif // defined(EIGEN_HAS_CUDA_FP16)
#elif defined(SYCL_DEVICE_ONLY)
typedef cl::sycl::half __half_raw;
@@ -171,13 +175,15 @@ struct half_base : public __half_raw {
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {}
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
#if defined(EIGEN_GPUCC)
#if defined(EIGEN_HIPCC)
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_HAS_HIP_FP16)
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
#elif defined(EIGEN_CUDACC)
#elif defined(EIGEN_HAS_CUDA_FP16)
#if EIGEN_CUDA_SDK_VER >= 90000
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
#endif
#endif
#endif
};
} // namespace half_impl
@@ -186,29 +192,36 @@ struct half_base : public __half_raw {
struct half : public half_impl::half_base {
// Writing this out as separate #if-else blocks to make the code easier to follow
// The same applies to most #if-else blocks in this file
#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)
#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
// Use the same base class for the following two scenarios
// * when compiling without GPU support enabled
// * during host compile phase when compiling with GPU support enabled
typedef half_impl::__half_raw __half_raw;
#elif defined(EIGEN_HIPCC)
#elif defined(EIGEN_HAS_HIP_FP16)
// Nothing to do here
// HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_CUDACC)
// Nothing to do here.
#elif defined(EIGEN_HAS_CUDA_FP16)
// Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
// (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within
// #if defined(EIGEN_HAS_CUDA_FP16) is needed
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
typedef half_impl::__half_raw __half_raw;
#endif
#endif
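// Illustrative note (not part of this change), to make the pitfall above concrete:
// a bare version check such as
//   #if EIGEN_CUDA_SDK_VER < 90000
//     typedef half_impl::__half_raw __half_raw;
//   #endif
// would also be taken when compiling with HIP, because EIGEN_CUDA_SDK_VER is 0
// there; that is why the check stays nested inside the EIGEN_HAS_CUDA_FP16 branch.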
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {}
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
#if defined(EIGEN_GPUCC)
#if defined(EIGEN_HIPCC)
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_HAS_HIP_FP16)
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
#elif defined(EIGEN_CUDACC)
#elif defined(EIGEN_HAS_CUDA_FP16)
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
#endif
#endif
#endif
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b)
@@ -235,7 +248,7 @@ struct half : public half_impl::half_base {
return half_impl::half_to_float(*this);
}
#if defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)
EIGEN_DEVICE_FUNC operator __half() const {
::__half_raw hr;
hr.x = x;
@@ -367,7 +380,8 @@ namespace Eigen {
namespace half_impl {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
// Note: We deliberately do *not* define this to 1 even if we have Arm's native
// fp16 type since GPU half types are rather different from native CPU half types.
#define EIGEN_HAS_NATIVE_GPU_FP16
@@ -379,10 +393,24 @@ namespace half_impl {
// conversion steps back and forth.
#if defined(EIGEN_HAS_NATIVE_GPU_FP16)
EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) { return __hadd(::__half(a), ::__half(b)); }
EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
return __hadd(::__half(a), ::__half(b));
#else
return __hadd(a, b);
#endif
}
EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { return __hmul(a, b); }
EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { return __hsub(a, b); }
EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) { return __hdiv(a, b); }
EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
return __hdiv(a, b);
#else
float num = __half2float(a);
float denom = __half2float(b);
return __float2half(num / denom);
#endif
}
EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { return __hneg(a); }
EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) {
a = a + b;
@@ -477,7 +505,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half&
// We need to provide emulated *host-side* FP16 operators for clang.
#pragma push_macro("EIGEN_DEVICE_FUNC")
#undef EIGEN_DEVICE_FUNC
#if defined(EIGEN_CUDACC) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
#define EIGEN_DEVICE_FUNC __host__
#else // both host and device need emulated ops.
#define EIGEN_DEVICE_FUNC __host__ __device__
@@ -608,7 +636,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint
// because this is constexpr function.
// Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
// of this catch22 by having separate bodies for GPU / non GPU
#if defined(EIGEN_GPUCC)
#if defined(EIGEN_HAS_GPU_FP16)
__half_raw h;
h.x = x;
return h;
@@ -633,7 +661,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
__half tmp_ff = __float2half(ff);
return *(__half_raw*)&tmp_ff;
@@ -706,7 +735,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __half2float(h);
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
return static_cast<float>(h.x);
@@ -748,7 +778,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hisnan(a);
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
@@ -779,14 +810,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hexp(a));
#else
return half(::expf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hexp2(a));
#else
return half(::exp2f(float(a)));
@@ -794,7 +827,9 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return half(hlog(a));
#else
return half(::logf(float(a)));
@@ -807,7 +842,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hsqrt(a));
#else
return half(::sqrtf(float(a)));
@@ -828,14 +864,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { return half(::a
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) { return half(::atanf(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) { return half(::atanhf(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE)
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hfloor(a));
#else
return half(::floorf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if (defined(EIGEN_CUDA_ARCH)) || defined(EIGEN_HIP_DEVICE_COMPILE)
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hceil(a));
#else
return half(::ceilf(float(a)));
@@ -969,12 +1007,20 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half madd<Eigen::half>(const Eigen:
} // namespace numext
} // namespace Eigen
// Warp shuffle overloads for Eigen::half.
// CUDA uses __shfl_*_sync (with mask); HIP uses __shfl_* (no mask).
// Add the missing shfl* intrinsics.
// The __shfl* functions are only valid on HIP or __CUDA_ARCH__ >= 300.
// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
//
// HIP and CUDA prior to SDK 9.0 define
// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
// CUDA since 9.0 deprecates those and instead defines
// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
// with native support for __half and __nv_bfloat16
//
// Note that the following are __device__ - only functions.
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) || defined(EIGEN_HIPCC)
#if defined(EIGEN_CUDACC)
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
int width = warpSize) {
@@ -1000,7 +1046,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen:
return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
}
#else // HIP
#else // HIP or CUDA SDK < 9.0
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
@@ -1026,7 +1072,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
#endif // __shfl*
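As a hedged illustration of how the overloads above are picked up by ordinary warp code (the device helper below is hypothetical and assumes the CUDA 9+ *_sync path):

#if defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
// Butterfly warp reduction: after the loop every lane holds the warp-wide sum.
__device__ inline Eigen::half warp_sum_half(Eigen::half v) {
  for (int lane_mask = warpSize / 2; lane_mask > 0; lane_mask /= 2) {
    v = v + __shfl_xor_sync(0xffffffffu, v, lane_mask);  // resolves to the Eigen::half overload above
  }
  return v;
}
#endif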
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) || defined(EIGEN_HIPCC)
EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
}
@@ -1049,7 +1095,8 @@ namespace internal {
template <>
struct cast_impl<float, half> {
EIGEN_DEVICE_FUNC static inline half run(const float& a) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __float2half(a);
#else
return half(a);
@@ -1060,7 +1107,8 @@ struct cast_impl<float, half> {
template <>
struct cast_impl<int, half> {
EIGEN_DEVICE_FUNC static inline half run(const int& a) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __float2half(static_cast<float>(a));
#else
return half(static_cast<float>(a));
@@ -1071,7 +1119,8 @@ struct cast_impl<int, half> {
template <>
struct cast_impl<half, float> {
EIGEN_DEVICE_FUNC static inline float run(const half& a) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __half2float(a);
#else
return static_cast<float>(a);

View File

@@ -17,8 +17,19 @@ namespace Eigen {
namespace internal {
// Read-only data cached load (__ldg) and native FP16 arithmetic are available
// on all supported GPU architectures (sm_70+ for CUDA, GFX906+ for HIP).
// Read-only data cached load available.
#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
#define EIGEN_GPU_HAS_LDG 1
#endif
// FP16 math available.
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
#endif
#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
#endif
// We need to distinguish clang as the CUDA compiler from clang as the host compiler,
// invoked by NVCC (e.g. on MacOS). The former needs to see both host and device implementation
@@ -45,84 +56,92 @@ struct is_arithmetic<double2> {
template <>
struct packet_traits<float> : default_packet_traits {
using type = float4;
using half = float4;
static constexpr int Vectorizable = 1;
static constexpr int AlignedOnScalar = 1;
static constexpr int size = 4;
typedef float4 type;
typedef float4 half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 4,
static constexpr int HasDiv = 1;
static constexpr int HasSin = 0;
static constexpr int HasCos = 0;
static constexpr int HasLog = 1;
static constexpr int HasExp = 1;
static constexpr int HasSqrt = 1;
static constexpr int HasRsqrt = 1;
static constexpr int HasLGamma = 1;
static constexpr int HasDiGamma = 1;
static constexpr int HasZeta = 1;
static constexpr int HasPolygamma = 1;
static constexpr int HasErf = 1;
static constexpr int HasErfc = 1;
static constexpr int HasNdtri = 1;
static constexpr int HasBessel = 1;
static constexpr int HasIGamma = 1;
static constexpr int HasIGammaDerA = 1;
static constexpr int HasGammaSampleDerAlpha = 1;
static constexpr int HasIGammac = 1;
static constexpr int HasBetaInc = 1;
HasDiv = 1,
HasSin = 0,
HasCos = 0,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasLGamma = 1,
HasDiGamma = 1,
HasZeta = 1,
HasPolygamma = 1,
HasErf = 1,
HasErfc = 1,
HasNdtri = 1,
HasBessel = 1,
HasIGamma = 1,
HasIGammaDerA = 1,
HasGammaSampleDerAlpha = 1,
HasIGammac = 1,
HasBetaInc = 1,
static constexpr int HasFloor = 1;
static constexpr int HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS;
HasFloor = 1,
HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
};
};
template <>
struct packet_traits<double> : default_packet_traits {
using type = double2;
using half = double2;
static constexpr int Vectorizable = 1;
static constexpr int AlignedOnScalar = 1;
static constexpr int size = 2;
typedef double2 type;
typedef double2 half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
static constexpr int HasDiv = 1;
static constexpr int HasLog = 1;
static constexpr int HasExp = 1;
static constexpr int HasSqrt = 1;
static constexpr int HasRsqrt = 1;
static constexpr int HasLGamma = 1;
static constexpr int HasDiGamma = 1;
static constexpr int HasZeta = 1;
static constexpr int HasPolygamma = 1;
static constexpr int HasErf = 1;
static constexpr int HasErfc = 1;
static constexpr int HasNdtri = 1;
static constexpr int HasBessel = 1;
static constexpr int HasIGamma = 1;
static constexpr int HasIGammaDerA = 1;
static constexpr int HasGammaSampleDerAlpha = 1;
static constexpr int HasIGammac = 1;
static constexpr int HasBetaInc = 1;
HasDiv = 1,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasLGamma = 1,
HasDiGamma = 1,
HasZeta = 1,
HasPolygamma = 1,
HasErf = 1,
HasErfc = 1,
HasNdtri = 1,
HasBessel = 1,
HasIGamma = 1,
HasIGammaDerA = 1,
HasGammaSampleDerAlpha = 1,
HasIGammac = 1,
HasBetaInc = 1,
};
};
template <>
struct unpacket_traits<float4> {
using type = float;
static constexpr int size = 4;
static constexpr int alignment = Aligned16;
static constexpr bool vectorizable = true;
static constexpr bool masked_load_available = false;
static constexpr bool masked_store_available = false;
using half = float4;
typedef float type;
enum {
size = 4,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
typedef float4 half;
};
template <>
struct unpacket_traits<double2> {
using type = double;
static constexpr int size = 2;
static constexpr int alignment = Aligned16;
static constexpr bool vectorizable = true;
static constexpr bool masked_load_available = false;
static constexpr bool masked_store_available = false;
using half = double2;
typedef double type;
enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
typedef double2 half;
};
template <>
@@ -384,7 +403,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const dou
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_GPU_HAS_LDG)
return __ldg(reinterpret_cast<const float4*>(from));
#else
return make_float4(from[0], from[1], from[2], from[3]);
@@ -392,7 +411,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_GPU_HAS_LDG)
return __ldg(reinterpret_cast<const double2*>(from));
#else
return make_double2(from[0], from[1]);
@@ -401,7 +420,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_GPU_HAS_LDG)
return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
#else
return make_float4(from[0], from[1], from[2], from[3]);
@@ -409,7 +428,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_GPU_HAS_LDG)
return make_double2(__ldg(from + 0), __ldg(from + 1));
#else
return make_double2(from[0], from[1]);
@@ -572,20 +591,23 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {
#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
// Half-packet functions are only available in GPU device compilation — they use
// intrinsics (__half2, etc.) that have no host-side benefit.
#if defined(EIGEN_GPU_COMPILE_PHASE)
// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
// on device. There is no benefit to using them on the host anyways, since they are
// emulated.
#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
using Packet4h2 = ulonglong2;
typedef ulonglong2 Packet4h2;
template <>
struct unpacket_traits<Packet4h2> {
using type = Eigen::half;
static constexpr int size = 8;
static constexpr int alignment = Aligned16;
static constexpr bool vectorizable = true;
static constexpr bool masked_load_available = false;
static constexpr bool masked_store_available = false;
using half = Packet4h2;
typedef Eigen::half type;
enum {
size = 8,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
typedef Packet4h2 half;
};
template <>
struct is_arithmetic<Packet4h2> {
@@ -594,13 +616,15 @@ struct is_arithmetic<Packet4h2> {
template <>
struct unpacket_traits<half2> {
using type = Eigen::half;
static constexpr int size = 2;
static constexpr int alignment = Aligned16;
static constexpr bool vectorizable = true;
static constexpr bool masked_load_available = false;
static constexpr bool masked_store_available = false;
using half = half2;
typedef Eigen::half type;
enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
masked_store_available = false
};
typedef half2 half;
};
template <>
struct is_arithmetic<half2> {
@@ -609,21 +633,23 @@ struct is_arithmetic<half2> {
template <>
struct packet_traits<Eigen::half> : default_packet_traits {
using type = Packet4h2;
using half = Packet4h2;
static constexpr int Vectorizable = 1;
static constexpr int AlignedOnScalar = 1;
static constexpr int size = 8;
static constexpr int HasAdd = 1;
static constexpr int HasSub = 1;
static constexpr int HasMul = 1;
static constexpr int HasDiv = 1;
static constexpr int HasSqrt = 1;
static constexpr int HasRsqrt = 1;
static constexpr int HasExp = 1;
static constexpr int HasExpm1 = 1;
static constexpr int HasLog = 1;
static constexpr int HasLog1p = 1;
typedef Packet4h2 type;
typedef Packet4h2 half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 8,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasExp = 1,
HasExpm1 = 1,
HasLog = 1,
HasLog1p = 1
};
};
template <>
@@ -664,7 +690,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2&
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_GPU_HAS_LDG)
// Input is guaranteed to be properly aligned.
return __ldg(reinterpret_cast<const half2*>(from));
#else
@@ -673,7 +699,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half*
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_GPU_HAS_LDG)
return __halves2half2(__ldg(from + 0), __ldg(from + 1));
#else
return __halves2half2(*(from + 0), *(from + 1));
@@ -719,7 +745,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& ker
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
float f = __half2float(a) + 1.0f;
return __halves2half2(a, __float2half(f));
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
@@ -806,21 +837,89 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2&
return __halves2half2(result1, result2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { return __hadd2(a, b); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 + b1;
float r2 = a2 + b2;
return __floats2half2_rn(r1, r2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { return __hsub2(a, b); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hsub2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 - b1;
float r2 = a2 - b2;
return __floats2half2_rn(r1, r2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { return __hneg2(a); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hneg2(a);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return __floats2half2_rn(-a1, -a2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { return __hmul2(a, b); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
return __hfma2(a, b, c);
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 * b1;
float r2 = a2 * b2;
return __floats2half2_rn(r1, r2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { return __h2div(a, b); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hfma2(a, b, c);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float c1 = __low2float(c);
float c2 = __high2float(c);
float r1 = a1 * b1 + c1;
float r2 = a2 * b2 + c2;
return __floats2half2_rn(r1, r2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __h2div(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 / b1;
float r2 = a2 / b2;
return __floats2half2_rn(r1, r2);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
float a1 = __low2float(a);
@@ -843,23 +942,47 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b)
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return Eigen::half(__float2half(a1 + a2));
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
__half first = __low2half(a);
__half second = __high2half(a);
return __hgt(first, second) ? first : second;
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return a1 > a2 ? __low2half(a) : __high2half(a);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
__half first = __low2half(a);
__half second = __high2half(a);
return __hlt(first, second) ? first : second;
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return a1 < a2 ? __low2half(a) : __high2half(a);
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return Eigen::half(__float2half(a1 * a2));
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
@@ -878,6 +1001,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
return __floats2half2_rn(r1, r2);
}
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
@@ -885,6 +1010,41 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }
#else
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = logf(a1);
float r2 = logf(a2);
return __floats2half2_rn(r1, r2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = expf(a1);
float r2 = expf(a2);
return __floats2half2_rn(r1, r2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = sqrtf(a1);
float r2 = sqrtf(a2);
return __floats2half2_rn(r1, r2);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = rsqrtf(a1);
float r2 = rsqrtf(a2);
return __floats2half2_rn(r1, r2);
}
#endif
} // namespace
template <>
@@ -931,17 +1091,19 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to,
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
#if defined(EIGEN_GPU_HAS_LDG)
Packet4h2 r;
#if defined(EIGEN_GPU_COMPILE_PHASE)
r = __ldg(reinterpret_cast<const Packet4h2*>(from));
return r;
#else
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
r_alias[0] = ploadt_ro_aligned(from + 0);
r_alias[1] = ploadt_ro_aligned(from + 2);
r_alias[2] = ploadt_ro_aligned(from + 4);
r_alias[3] = ploadt_ro_aligned(from + 6);
#endif
return r;
#endif
}
template <>
@@ -1110,7 +1272,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::ha
p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
return r;
#elif defined(EIGEN_CUDA_ARCH)
#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
@@ -1128,6 +1290,16 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::ha
r_alias[3] = plset(__high2half(c));
return r;
#else
float f = __half2float(a);
Packet4h2 r;
half2* p_alias = reinterpret_cast<half2*>(&r);
p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
return r;
#endif
}
@@ -1361,7 +1533,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Pa
half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
__half first = predux_max(m0);
__half second = predux_max(m1);
#if defined(EIGEN_CUDA_ARCH)
#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
return (__hgt(first, second) ? first : second);
#else
float ffirst = __half2float(first);
@@ -1377,7 +1549,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Pa
half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
__half first = predux_min(m0);
__half second = predux_min(m1);
#if defined(EIGEN_CUDA_ARCH)
#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
return (__hlt(first, second) ? first : second);
#else
float ffirst = __half2float(first);
@@ -1469,17 +1641,47 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h
// the implementation of GPU half reduction.
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 + b1;
float r2 = a2 + b2;
return __floats2half2_rn(r1, r2);
#endif
}
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 * b1;
float r2 = a2 * b2;
return __floats2half2_rn(r1, r2);
#endif
}
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __h2div(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 / b1;
float r2 = a2 / b2;
return __floats2half2_rn(r1, r2);
#endif
}
template <>
@@ -1504,7 +1706,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const ha
return __halves2half2(r1, r2);
}
#endif // defined(EIGEN_GPU_COMPILE_PHASE)
#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
#undef EIGEN_GPU_HAS_LDG
#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
} // end namespace internal

View File

@@ -17,7 +17,8 @@ namespace Eigen {
namespace internal {
#if defined(EIGEN_GPU_COMPILE_PHASE)
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
template <>
struct type_casting_traits<Eigen::half, float> {

View File

@@ -541,6 +541,12 @@ extern "C" {
#if defined EIGEN_CUDACC
#define EIGEN_VECTORIZE_GPU
#include <vector_types.h>
#if EIGEN_CUDA_SDK_VER >= 70500
#define EIGEN_HAS_CUDA_FP16
#endif
#endif
#if defined(EIGEN_HAS_CUDA_FP16)
#include <cuda_runtime_api.h>
#include <cuda_fp16.h>
#endif
@@ -548,6 +554,7 @@ extern "C" {
#if defined(EIGEN_HIPCC)
#define EIGEN_VECTORIZE_GPU
#include <hip/hip_vector_types.h>
#define EIGEN_HAS_HIP_FP16
#include <hip/hip_fp16.h>
#define EIGEN_HAS_HIP_BF16
#include <hip/hip_bfloat16.h>

View File

@@ -84,7 +84,8 @@
#endif
#if defined __NVCC__ && defined __CUDACC__
// MSVC does not support the _Pragma keyword, so we use Microsoft's __pragma extension.
// MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so
// we instead use Microsoft's __pragma extension.
#if defined _MSC_VER
#define EIGEN_MAKE_PRAGMA(X) __pragma(#X)
#else

View File

@@ -148,8 +148,13 @@
#endif
#if defined(__NVCC__)
// CUDA 11.4+ always defines __CUDACC_VER_MAJOR__.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
#elif defined(__CUDACC_VER__)
#define EIGEN_COMP_NVCC __CUDACC_VER__
#else
#error "NVCC did not define compiler version."
#endif
#else
#define EIGEN_COMP_NVCC 0
#endif
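For example (illustrative, not part of the change), nvcc 11.4 reports __CUDACC_VER_MAJOR__ == 11 and __CUDACC_VER_MINOR__ == 4, so the formula above yields EIGEN_COMP_NVCC == 110400 and a version guard can be written as:

#if EIGEN_COMP_NVCC >= 110400
// ... workarounds specific to nvcc 11.4 and newer ...
#endif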
@@ -570,10 +575,6 @@
#define EIGEN_CUDA_SDK_VER 0
#endif
#if defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER > 0 && EIGEN_CUDA_SDK_VER < 110400
#error "Eigen requires CUDA 11.4 or later."
#endif
#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) && !defined(__SYCL_DEVICE_ONLY__)
// Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
#define EIGEN_HIPCC __HIPCC__
@@ -583,20 +584,22 @@
// ++ host_defines.h which contains the defines for the __host__ and __device__ macros
#include <hip/hip_runtime.h>
// Eigen requires ROCm/HIP >= 5.6 (GFX906 minimum architecture).
// This floor exists to allow simplifying shared CUDA/HIP preprocessor guards —
// all __HIP_ARCH_HAS_WARP_SHUFFLE__, __HIP_ARCH_HAS_FP16__, etc. are always true on GFX906+.
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 5 || (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 6))
#error "Eigen requires ROCm/HIP >= 5.6."
#endif
#if defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
// analogous to EIGEN_CUDA_ARCH, but for HIP
#define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
#endif
// HIP compilers default to launch_bounds(256), which causes failures when kernels
// are called with more than 256 threads per block. Explicitly set to 1024 for HIP.
// For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
// value to 1024. The compiler assigns a default value of 256 when the attribute is not
// specified. This results in failures on the HIP platform, for cases when a GPU kernel
// without an explicit launch_bounds attribute is called with a threads_per_block value
// greater than 256.
//
// This is a regression in functionality and is expected to be fixed within the next
// couple of ROCm releases (compiler will go back to using 1024 value as the default)
//
// In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds
// attribute.
#define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
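A minimal sketch of how the attribute is applied (the kernel below is hypothetical; on non-HIP builds Eigen defines EIGEN_HIP_LAUNCH_BOUNDS_1024 as empty elsewhere):

__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void scale_kernel(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;  // can now be launched with up to 1024 threads per block on HIP
}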

View File

@@ -1,160 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// cuBLAS-specific support types:
// - Error-checking macro
// - Operation enum and mapping to cublasOperation_t
//
// Generic CUDA runtime utilities (DeviceBuffer, cuda_data_type) are in GpuSupport.h.
#ifndef EIGEN_GPU_CUBLAS_SUPPORT_H
#define EIGEN_GPU_CUBLAS_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include <cublas_v2.h>
namespace Eigen {
namespace internal {
// ---- Error-checking macro ---------------------------------------------------
#define EIGEN_CUBLAS_CHECK(expr) \
do { \
cublasStatus_t _s = (expr); \
eigen_assert(_s == CUBLAS_STATUS_SUCCESS && "cuBLAS call failed"); \
} while (0)
// ---- Operation enum ---------------------------------------------------------
// Maps transpose/adjoint flags to cublasOperation_t.
enum class GpuOp { NoTrans, Trans, ConjTrans };
constexpr cublasOperation_t to_cublas_op(GpuOp op) {
switch (op) {
case GpuOp::Trans:
return CUBLAS_OP_T;
case GpuOp::ConjTrans:
return CUBLAS_OP_C;
default:
return CUBLAS_OP_N;
}
}
// ---- Scalar → cublasComputeType_t -------------------------------------------
// cublasGemmEx requires a compute type (separate from the data type).
template <typename Scalar>
struct cuda_compute_type;
template <>
struct cuda_compute_type<float> {
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
};
template <>
struct cuda_compute_type<double> {
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
};
template <>
struct cuda_compute_type<std::complex<float>> {
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
};
template <>
struct cuda_compute_type<std::complex<double>> {
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
};
// ---- Type-specific cuBLAS wrappers ------------------------------------------
// cuBLAS uses separate functions per type (Strsm, Dtrsm, etc.).
// These overloaded wrappers allow calling cublasXtrsm/cublasXsymm/cublasXsyrk
// with any supported scalar type.
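// A sketch of the intended use (hypothetical helper, not part of this header):
//   template <typename Scalar>
//   void lower_solve(cublasHandle_t h, int n, int nrhs, const Scalar* A, int lda, Scalar* B, int ldb) {
//     Scalar one(1);
//     EIGEN_CUBLAS_CHECK(cublasXtrsm(h, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
//                                    CUBLAS_DIAG_NON_UNIT, n, nrhs, &one, A, lda, B, ldb));
//   }
// Overload resolution selects cublasStrsm/Dtrsm/Ctrsm/Ztrsm from the Scalar type.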
// TRSM wrappers
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha,
const float* A, int lda, float* B, int ldb) {
return cublasStrsm(h, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
}
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha,
const double* A, int lda, double* B, int ldb) {
return cublasDtrsm(h, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
}
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
const std::complex<float>* alpha, const std::complex<float>* A, int lda,
std::complex<float>* B, int ldb) {
return cublasCtrsm(h, side, uplo, trans, diag, m, n, reinterpret_cast<const cuComplex*>(alpha),
reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(B), ldb);
}
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
const std::complex<double>* alpha, const std::complex<double>* A, int lda,
std::complex<double>* B, int ldb) {
return cublasZtrsm(h, side, uplo, trans, diag, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<cuDoubleComplex*>(B), ldb);
}
// SYMM wrappers (real → symm, complex → hemm)
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
const float* alpha, const float* A, int lda, const float* B, int ldb,
const float* beta, float* C, int ldc) {
return cublasSsymm(h, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
}
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
const double* alpha, const double* A, int lda, const double* B, int ldb,
const double* beta, double* C, int ldc) {
return cublasDsymm(h, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
}
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
const std::complex<float>* alpha, const std::complex<float>* A, int lda,
const std::complex<float>* B, int ldb, const std::complex<float>* beta,
std::complex<float>* C, int ldc) {
return cublasChemm(h, side, uplo, m, n, reinterpret_cast<const cuComplex*>(alpha),
reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<const cuComplex*>(B), ldb,
reinterpret_cast<const cuComplex*>(beta), reinterpret_cast<cuComplex*>(C), ldc);
}
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
const std::complex<double>* alpha, const std::complex<double>* A, int lda,
const std::complex<double>* B, int ldb, const std::complex<double>* beta,
std::complex<double>* C, int ldc) {
return cublasZhemm(h, side, uplo, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<const cuDoubleComplex*>(B), ldb,
reinterpret_cast<const cuDoubleComplex*>(beta), reinterpret_cast<cuDoubleComplex*>(C), ldc);
}
// SYRK wrappers (real → syrk, complex → herk)
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
const float* alpha, const float* A, int lda, const float* beta, float* C, int ldc) {
return cublasSsyrk(h, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
}
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
const double* alpha, const double* A, int lda, const double* beta, double* C,
int ldc) {
return cublasDsyrk(h, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
}
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
const float* alpha, const std::complex<float>* A, int lda, const float* beta,
std::complex<float>* C, int ldc) {
return cublasCherk(h, uplo, trans, n, k, alpha, reinterpret_cast<const cuComplex*>(A), lda, beta,
reinterpret_cast<cuComplex*>(C), ldc);
}
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
const double* alpha, const std::complex<double>* A, int lda, const double* beta,
std::complex<double>* C, int ldc) {
return cublasZherk(h, uplo, trans, n, k, alpha, reinterpret_cast<const cuDoubleComplex*>(A), lda, beta,
reinterpret_cast<cuDoubleComplex*>(C), ldc);
}
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_CUBLAS_SUPPORT_H

View File

@@ -1,97 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// cuSOLVER-specific support types:
// - cuSOLVER error-checking macro
// - RAII wrapper for cusolverDnParams
// - Scalar → cudaDataType_t mapping
// - (UpLo, StorageOrder) → cublasFillMode_t mapping
//
// Generic CUDA runtime utilities (DeviceBuffer, EIGEN_CUDA_RUNTIME_CHECK)
// are in GpuSupport.h.
#ifndef EIGEN_GPU_CUSOLVER_SUPPORT_H
#define EIGEN_GPU_CUSOLVER_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include <cusolverDn.h>
namespace Eigen {
namespace internal {
// ---- Error-checking macros --------------------------------------------------
#define EIGEN_CUSOLVER_CHECK(expr) \
do { \
cusolverStatus_t _s = (expr); \
eigen_assert(_s == CUSOLVER_STATUS_SUCCESS && "cuSOLVER call failed"); \
} while (0)
// ---- RAII: cusolverDnParams -------------------------------------------------
struct CusolverParams {
cusolverDnParams_t p = nullptr;
CusolverParams() { EIGEN_CUSOLVER_CHECK(cusolverDnCreateParams(&p)); }
~CusolverParams() {
if (p) (void)cusolverDnDestroyParams(p); // destructor: can't propagate
}
// Move-only.
CusolverParams(CusolverParams&& o) noexcept : p(o.p) { o.p = nullptr; }
CusolverParams& operator=(CusolverParams&& o) noexcept {
if (this != &o) {
if (p) (void)cusolverDnDestroyParams(p);
p = o.p;
o.p = nullptr;
}
return *this;
}
CusolverParams(const CusolverParams&) = delete;
CusolverParams& operator=(const CusolverParams&) = delete;
};
// ---- Scalar → cudaDataType_t ------------------------------------------------
// Alias for backward compatibility. The canonical trait is cuda_data_type<> in GpuSupport.h.
template <typename Scalar>
using cusolver_data_type = cuda_data_type<Scalar>;
// ---- (UpLo, StorageOrder) → cublasFillMode_t --------------------------------
// cuSOLVER always interprets the matrix as column-major. A row-major matrix A
// appears as A^T to cuSOLVER, so the upper/lower triangle is swapped.
template <int UpLo, int StorageOrder>
struct cusolver_fill_mode;
template <>
struct cusolver_fill_mode<Lower, ColMajor> {
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
};
template <>
struct cusolver_fill_mode<Upper, ColMajor> {
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_UPPER;
};
template <>
struct cusolver_fill_mode<Lower, RowMajor> {
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_UPPER;
};
template <>
struct cusolver_fill_mode<Upper, RowMajor> {
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
};
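// For example, cusolver_fill_mode<Upper, RowMajor>::value is CUBLAS_FILL_MODE_LOWER: the Upper
// triangle of a row-major matrix occupies the Lower triangle of the column-major matrix that
// cuSOLVER actually sees.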
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_CUSOLVER_SUPPORT_H

View File

@@ -1,146 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// BLAS Level 3 expression types for DeviceMatrix (beyond GEMM):
// TrsmExpr → cublasXtrsm (triangular solve)
// SymmExpr → cublasXsymm (symmetric multiply, real)
// → cublasXhemm (Hermitian multiply, complex)
// SyrkExpr → cublasXsyrk (symmetric rank-k update, real)
// → cublasXherk (Hermitian rank-k update, complex)
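//
// A usage sketch (assuming DeviceMatrix operands d_A, d_B, d_C, d_X of the same Scalar and a
// GpuContext ctx; the lowering to cuBLAS calls lives in DeviceDispatch.h):
//   d_X.device(ctx) = d_A.triangularView<Lower>().solve(d_B);  // TRSM
//   d_X.device(ctx) = d_A.selfadjointView<Lower>() * d_B;      // SYMM / HEMM
//   d_C.selfadjointView<Lower>().rankUpdate(d_A);              // SYRK / HERK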
#ifndef EIGEN_GPU_DEVICE_BLAS_EXPR_H
#define EIGEN_GPU_DEVICE_BLAS_EXPR_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
namespace Eigen {
template <typename Scalar_>
class DeviceMatrix;
// Forward declaration: TrsmExpr is returned by DeviceTriangularView::solve() just below,
// but is only defined further down in this file.
template <typename Scalar_, int UpLo_>
class TrsmExpr;
// ---- DeviceTriangularView ---------------------------------------------------
// d_A.triangularView<Lower>() → view with .solve(d_B)
template <typename Scalar_, int UpLo_>
class DeviceTriangularView {
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };
explicit DeviceTriangularView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
/** Build a TRSM solve expression. */
TrsmExpr<Scalar, UpLo_> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
private:
const DeviceMatrix<Scalar>& mat_;
};
// ---- TrsmExpr: triangularView<UpLo>().solve(B) → cublasXtrsm ---------------
template <typename Scalar_, int UpLo_>
class TrsmExpr {
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };
TrsmExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
const DeviceMatrix<Scalar>& matrix() const { return A_; }
const DeviceMatrix<Scalar>& rhs() const { return B_; }
private:
const DeviceMatrix<Scalar>& A_;
const DeviceMatrix<Scalar>& B_;
};
// ---- DeviceSelfAdjointView --------------------------------------------------
// d_A.selfadjointView<Lower>() → view that can multiply: view * d_B
template <typename Scalar_, int UpLo_>
class DeviceSelfAdjointView {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
enum { UpLo = UpLo_ };
explicit DeviceSelfAdjointView(DeviceMatrix<Scalar>& m) : mat_(m) {}
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
DeviceMatrix<Scalar>& matrix() { return mat_; }
/** Rank-k update: C.selfadjointView<Lower>().rankUpdate(A, alpha)
* computes C = alpha * A * A^H + C (lower triangle only).
* Maps to cublasXsyrk (real) or cublasXherk (complex). */
void rankUpdate(const DeviceMatrix<Scalar>& A, RealScalar alpha = RealScalar(1));
private:
DeviceMatrix<Scalar>& mat_;
};
// Const variant for multiplication only (no rankUpdate).
template <typename Scalar_, int UpLo_>
class ConstDeviceSelfAdjointView {
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };
explicit ConstDeviceSelfAdjointView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
private:
const DeviceMatrix<Scalar>& mat_;
};
// ---- SymmExpr: selfadjointView<UpLo>() * B → cublasXsymm/Xhemm ------------
template <typename Scalar_, int UpLo_>
class SymmExpr {
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };
SymmExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
const DeviceMatrix<Scalar>& matrix() const { return A_; }
const DeviceMatrix<Scalar>& rhs() const { return B_; }
private:
const DeviceMatrix<Scalar>& A_;
const DeviceMatrix<Scalar>& B_;
};
// operator*: DeviceSelfAdjointView * DeviceMatrix → SymmExpr (mutable and const variants)
template <typename S, int UpLo>
SymmExpr<S, UpLo> operator*(const DeviceSelfAdjointView<S, UpLo>& a, const DeviceMatrix<S>& b) {
return {a.matrix(), b};
}
template <typename S, int UpLo>
SymmExpr<S, UpLo> operator*(const ConstDeviceSelfAdjointView<S, UpLo>& a, const DeviceMatrix<S>& b) {
return {a.matrix(), b};
}
// ---- SyrkExpr: rankUpdate(A) → cublasXsyrk/Xherk ---------------------------
// C.rankUpdate(A, alpha) computes C = alpha * A * A^H + beta * C. The dispatch fixes the
// operation to CUBLAS_OP_N, so the update is A * A^H, not A^H * A (see dispatch_syrk).
template <typename Scalar_, int UpLo_>
class SyrkExpr {
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };
SyrkExpr(const DeviceMatrix<Scalar>& A) : A_(A) {}
const DeviceMatrix<Scalar>& matrix() const { return A_; }
private:
const DeviceMatrix<Scalar>& A_;
};
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_BLAS_EXPR_H

View File

@@ -1,506 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Dispatch functions that map DeviceMatrix expressions to NVIDIA library calls.
//
//   dispatch_gemm():      GemmExpr     → cublasGemmEx
//   dispatch_llt_solve(): LltSolveExpr → cusolverDnXpotrf + cusolverDnXpotrs
//   dispatch_lu_solve():  LuSolveExpr  → cusolverDnXgetrf + cusolverDnXgetrs
//   dispatch_trsm():      TrsmExpr     → cublasXtrsm
//   dispatch_symm():      SymmExpr     → cublasXsymm (hemm for complex scalars)
//   dispatch_syrk():      SyrkExpr     → cublasXsyrk (herk for complex scalars)
//
// Each function documents the exact library call and parameters.
#ifndef EIGEN_GPU_DEVICE_DISPATCH_H
#define EIGEN_GPU_DEVICE_DISPATCH_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./DeviceExpr.h"
#include "./DeviceBlasExpr.h"
#include "./DeviceSolverExpr.h"
#include "./GpuContext.h"
#include "./CuSolverSupport.h"
namespace Eigen {
namespace internal {
// ---- GEMM dispatch ----------------------------------------------------------
// GemmExpr<Lhs, Rhs> → cublasGemmEx(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
//
// The generic API cublasGemmEx handles all scalar types (float, double,
// complex<float>, complex<double>) via cudaDataType_t.
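// For example (sketch), with double scalars, `d_C.device(ctx) = 2.0 * d_A.adjoint() * d_B;` lowers
// to a single cublasGemmEx call with transA = CUBLAS_OP_C, transB = CUBLAS_OP_N, alpha = 2 and
// beta = 0; operator+= issues the same call with beta = 1.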
template <typename Lhs, typename Rhs>
void dispatch_gemm(
GpuContext& ctx, DeviceMatrix<typename device_expr_traits<Lhs>::scalar_type>& dst, const GemmExpr<Lhs, Rhs>& expr,
typename device_expr_traits<Lhs>::scalar_type beta_val,
typename device_expr_traits<Lhs>::scalar_type alpha_scale = typename device_expr_traits<Lhs>::scalar_type(1)) {
using Scalar = typename device_expr_traits<Lhs>::scalar_type;
using traits_lhs = device_expr_traits<Lhs>;
using traits_rhs = device_expr_traits<Rhs>;
const DeviceMatrix<Scalar>& A = traits_lhs::matrix(expr.lhs());
const DeviceMatrix<Scalar>& B = traits_rhs::matrix(expr.rhs());
constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op);
constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op);
// GEMM dimensions: C(m,n) = op(A)(m,k) * op(B)(k,n)
// op(A) has dimensions (A.rows, A.cols) if NoTrans, (A.cols, A.rows) if Trans/ConjTrans.
const int64_t m = (traits_lhs::op == GpuOp::NoTrans) ? A.rows() : A.cols();
const int64_t k = (traits_lhs::op == GpuOp::NoTrans) ? A.cols() : A.rows();
const int64_t n = (traits_rhs::op == GpuOp::NoTrans) ? B.cols() : B.rows();
const int64_t rhs_k = (traits_rhs::op == GpuOp::NoTrans) ? B.rows() : B.cols();
eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch");
const int64_t lda = A.outerStride();
const int64_t ldb = B.outerStride();
// Serialize all accesses to the destination buffer on this stream.
if (!dst.empty()) {
dst.waitReady(ctx.stream());
}
// Allocate or resize destination.
const bool resized = dst.empty() || dst.rows() != m || dst.cols() != n;
if (resized) {
dst.resize(m, n);
}
const int64_t ldc = dst.outerStride();
Scalar alpha_val = alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs());
// Wait for operands to be ready on this stream.
A.waitReady(ctx.stream());
B.waitReady(ctx.stream());
// If there is no existing valid destination to accumulate into, treat it as
// zero rather than reading uninitialized memory.
if (resized && beta_val != Scalar(0) && dst.sizeInBytes() > 0) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
}
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
EIGEN_CUBLAS_CHECK(cublasGemmEx(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n),
static_cast<int>(k), &alpha_val, A.data(), dtype, static_cast<int>(lda), B.data(),
dtype, static_cast<int>(ldb), &beta_val, dst.data(), dtype, static_cast<int>(ldc),
compute, CUBLAS_GEMM_DEFAULT));
dst.recordReady(ctx.stream());
}
// ---- LLT solve dispatch -----------------------------------------------------
// LltSolveExpr → cusolverDnXpotrf (factorize) + cusolverDnXpotrs (solve).
// No caching — factor and workspace are temporary. Syncs to check info.
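// Typically reached via `d_X.device(ctx) = d_A.llt().solve(d_B);` (see DeviceSolverExpr.h).
// For repeated solves against the same matrix, the caching GpuLLT class avoids re-factorizing.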
template <typename Scalar, int UpLo>
void dispatch_llt_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LltSolveExpr<Scalar, UpLo>& expr) {
const DeviceMatrix<Scalar>& A = expr.matrix();
const DeviceMatrix<Scalar>& B = expr.rhs();
eigen_assert(A.rows() == A.cols() && "LLT requires a square matrix");
eigen_assert(B.rows() == A.rows() && "LLT solve: RHS rows must match matrix size");
const Index n = A.rows();
const int64_t nrhs = static_cast<int64_t>(B.cols());
// Zero-size fast paths: no work, just resize dst.
// Wait on dst before resize to avoid freeing memory another stream is using.
if (n == 0 || nrhs == 0) {
if (!dst.empty()) dst.waitReady(ctx.stream());
dst.resize(n == 0 ? 0 : n, B.cols());
return;
}
A.waitReady(ctx.stream());
B.waitReady(ctx.stream());
if (!dst.empty()) dst.waitReady(ctx.stream());
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo, ColMajor>::value;
const int64_t lda = static_cast<int64_t>(A.outerStride());
const int64_t ldb = static_cast<int64_t>(B.outerStride());
eigen_assert(ldb == static_cast<int64_t>(B.rows()) && "DeviceMatrix must be densely packed");
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
// D2D copy A → factor buffer (potrf is in-place).
DeviceBuffer d_factor(mat_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_factor.ptr, A.data(), mat_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
// Query workspace and factorize.
CusolverParams params;
DeviceBuffer d_factorize_info(sizeof(int));
size_t dev_ws = 0, host_ws = 0;
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), dtype,
d_factor.ptr, lda, dtype, &dev_ws, &host_ws));
DeviceBuffer d_workspace(dev_ws);
std::vector<char> h_workspace(host_ws);
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(
ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), dtype, d_factor.ptr, lda, dtype, d_workspace.ptr,
dev_ws, host_ws > 0 ? h_workspace.data() : nullptr, host_ws, static_cast<int*>(d_factorize_info.ptr)));
// Check factorization info before proceeding to solve.
int factorize_info = 0;
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&factorize_info, d_factorize_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
eigen_assert(factorize_info == 0 && "cuSOLVER LLT factorization failed (matrix not positive definite)");
// D2D copy B → dst (potrs is in-place on the RHS).
dst.resize(n, B.cols());
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
// Solve.
DeviceBuffer d_solve_info(sizeof(int));
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), nrhs, dtype,
d_factor.ptr, lda, dtype, dst.data(), static_cast<int64_t>(dst.outerStride()),
static_cast<int*>(d_solve_info.ptr)));
// Sync to ensure workspace locals can be freed safely.
int solve_info = 0;
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&solve_info, d_solve_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
eigen_assert(solve_info == 0 && "cuSOLVER LLT solve failed");
dst.recordReady(ctx.stream());
}
// ---- LU solve dispatch ------------------------------------------------------
// LuSolveExpr → cusolverDnXgetrf (factorize) + cusolverDnXgetrs (solve).
template <typename Scalar>
void dispatch_lu_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LuSolveExpr<Scalar>& expr) {
const DeviceMatrix<Scalar>& A = expr.matrix();
const DeviceMatrix<Scalar>& B = expr.rhs();
eigen_assert(A.rows() == A.cols() && "LU requires a square matrix");
eigen_assert(B.rows() == A.rows() && "LU solve: RHS rows must match matrix size");
const Index n = A.rows();
const int64_t nrhs = static_cast<int64_t>(B.cols());
if (n == 0 || nrhs == 0) {
if (!dst.empty()) dst.waitReady(ctx.stream());
dst.resize(n == 0 ? 0 : n, B.cols());
return;
}
A.waitReady(ctx.stream());
B.waitReady(ctx.stream());
if (!dst.empty()) dst.waitReady(ctx.stream());
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
const int64_t lda = static_cast<int64_t>(A.outerStride());
const int64_t ldb = static_cast<int64_t>(B.outerStride());
eigen_assert(ldb == static_cast<int64_t>(B.rows()) && "DeviceMatrix must be densely packed");
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t);
// D2D copy A → LU buffer (getrf is in-place).
DeviceBuffer d_lu(mat_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu.ptr, A.data(), mat_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
DeviceBuffer d_ipiv(ipiv_bytes);
// Query workspace and factorize.
CusolverParams params;
DeviceBuffer d_factorize_info(sizeof(int));
size_t dev_ws = 0, host_ws = 0;
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(ctx.cusolverHandle(), params.p, static_cast<int64_t>(n),
static_cast<int64_t>(n), dtype, d_lu.ptr, lda, dtype, &dev_ws,
&host_ws));
DeviceBuffer d_workspace(dev_ws);
std::vector<char> h_workspace(host_ws);
EIGEN_CUSOLVER_CHECK(
cusolverDnXgetrf(ctx.cusolverHandle(), params.p, static_cast<int64_t>(n), static_cast<int64_t>(n), dtype,
d_lu.ptr, lda, static_cast<int64_t*>(d_ipiv.ptr), dtype, d_workspace.ptr, dev_ws,
host_ws > 0 ? h_workspace.data() : nullptr, host_ws, static_cast<int*>(d_factorize_info.ptr)));
// Check factorization info before proceeding to solve.
int factorize_info = 0;
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&factorize_info, d_factorize_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
eigen_assert(factorize_info == 0 && "cuSOLVER LU factorization failed (singular matrix)");
// D2D copy B → dst (getrs is in-place on the RHS).
dst.resize(n, B.cols());
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
// Solve (NoTranspose).
DeviceBuffer d_solve_info(sizeof(int));
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, static_cast<int64_t>(n), nrhs,
dtype, d_lu.ptr, lda, static_cast<const int64_t*>(d_ipiv.ptr), dtype,
dst.data(), static_cast<int64_t>(dst.outerStride()),
static_cast<int*>(d_solve_info.ptr)));
// Sync to ensure workspace locals can be freed safely.
int solve_info = 0;
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&solve_info, d_solve_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
eigen_assert(solve_info == 0 && "cuSOLVER LU solve failed");
dst.recordReady(ctx.stream());
}
// ---- TRSM dispatch ----------------------------------------------------------
// TrsmExpr → cublasXtrsm: solve A * X = B where A is triangular (no transpose is applied).
// Side=Left, Diag=NonUnit. A is square, B is n×nrhs.
template <typename Scalar, int UpLo>
void dispatch_trsm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const TrsmExpr<Scalar, UpLo>& expr) {
const DeviceMatrix<Scalar>& A = expr.matrix();
const DeviceMatrix<Scalar>& B = expr.rhs();
eigen_assert(A.rows() == A.cols() && "TRSM requires a square triangular matrix");
eigen_assert(B.rows() == A.rows() && "TRSM: RHS rows must match matrix size");
const int n = static_cast<int>(A.rows());
const int nrhs = static_cast<int>(B.cols());
if (n == 0 || nrhs == 0) {
if (!dst.empty()) dst.waitReady(ctx.stream());
dst.resize(n == 0 ? 0 : n, B.cols());
return;
}
A.waitReady(ctx.stream());
B.waitReady(ctx.stream());
if (!dst.empty()) dst.waitReady(ctx.stream());
// D2D copy B → dst (trsm is in-place on the RHS).
dst.resize(n, B.cols());
const size_t rhs_bytes = static_cast<size_t>(dst.outerStride()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
Scalar alpha(1);
EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs,
&alpha, A.data(), static_cast<int>(A.outerStride()), dst.data(),
static_cast<int>(dst.outerStride())));
dst.recordReady(ctx.stream());
}
// ---- SYMM/HEMM dispatch -----------------------------------------------------
// SymmExpr → cublasXsymm (real) or cublasXhemm (complex).
// C = A * B where A is symmetric/Hermitian. Side=Left.
template <typename Scalar, int UpLo>
void dispatch_symm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SymmExpr<Scalar, UpLo>& expr) {
const DeviceMatrix<Scalar>& A = expr.matrix();
const DeviceMatrix<Scalar>& B = expr.rhs();
eigen_assert(A.rows() == A.cols() && "SYMM requires a square matrix");
eigen_assert(B.rows() == A.rows() && "SYMM: RHS rows must match matrix size");
const int m = static_cast<int>(A.rows());
const int n = static_cast<int>(B.cols());
if (m == 0 || n == 0) {
if (!dst.empty()) dst.waitReady(ctx.stream());
dst.resize(m == 0 ? 0 : m, B.cols());
return;
}
A.waitReady(ctx.stream());
B.waitReady(ctx.stream());
if (!dst.empty()) dst.waitReady(ctx.stream());
dst.resize(m, n);
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
Scalar alpha(1), beta(0);
EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &alpha, A.data(),
static_cast<int>(A.outerStride()), B.data(), static_cast<int>(B.outerStride()), &beta,
dst.data(), static_cast<int>(dst.outerStride())));
dst.recordReady(ctx.stream());
}
// ---- SYRK/HERK dispatch -----------------------------------------------------
// SyrkExpr → cublasXsyrk (real) or cublasXherk (complex).
// C = alpha * A * A^H + beta * C. UpLo specifies which triangle of C is stored.
template <typename Scalar, int UpLo>
void dispatch_syrk(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SyrkExpr<Scalar, UpLo>& expr,
typename NumTraits<Scalar>::Real alpha_val, typename NumTraits<Scalar>::Real beta_val) {
using RealScalar = typename NumTraits<Scalar>::Real;
const DeviceMatrix<Scalar>& A = expr.matrix();
const int n = static_cast<int>(A.rows());
const int k = static_cast<int>(A.cols());
if (n == 0) {
if (!dst.empty()) dst.waitReady(ctx.stream());
dst.resize(0, 0);
return;
}
A.waitReady(ctx.stream());
if (!dst.empty()) dst.waitReady(ctx.stream());
if (dst.empty() || dst.rows() != n || dst.cols() != n) {
dst.resize(n, n);
if (beta_val != RealScalar(0)) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
}
}
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(),
static_cast<int>(A.outerStride()), &beta_val, dst.data(),
static_cast<int>(dst.outerStride())));
dst.recordReady(ctx.stream());
}
} // namespace internal
// ---- DeviceAssignment: d_C.device(ctx) = expr ------------------------------
// Returned by DeviceMatrix::device(ctx). Dispatches expressions to library calls.
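// A sketch of the supported assignments (assuming conforming DeviceMatrix operands):
//   d_C.device(ctx)  = d_A * d_B;             // GEMM, beta = 0 (overwrite)
//   d_C.device(ctx) += d_A * d_B;             // GEMM, beta = 1 (accumulate)
//   d_C.device(ctx) -= d_A * d_B;             // GEMM, alpha negated, beta = 1
//   d_X.device(ctx)  = d_A.llt().solve(d_B);  // Xpotrf + Xpotrs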
template <typename Scalar_>
class DeviceAssignment {
public:
using Scalar = Scalar_;
DeviceAssignment(DeviceMatrix<Scalar>& dst, GpuContext& ctx) : dst_(dst), ctx_(ctx) {}
// operator= dispatches GEMM with beta=0 (overwrite).
template <typename Lhs, typename Rhs>
DeviceMatrix<Scalar>& operator=(const GemmExpr<Lhs, Rhs>& expr) {
internal::dispatch_gemm(ctx_, dst_, expr, Scalar(0));
return dst_;
}
// operator+= dispatches GEMM with beta=1 (accumulate).
template <typename Lhs, typename Rhs>
DeviceMatrix<Scalar>& operator+=(const GemmExpr<Lhs, Rhs>& expr) {
internal::dispatch_gemm(ctx_, dst_, expr, Scalar(1));
return dst_;
}
// operator-= dispatches GEMM with negated alpha, beta=1: C = C - alpha*op(A)*op(B).
template <typename Lhs, typename Rhs>
DeviceMatrix<Scalar>& operator-=(const GemmExpr<Lhs, Rhs>& expr) {
internal::dispatch_gemm(ctx_, dst_, expr, Scalar(1), Scalar(-1));
return dst_;
}
// operator= dispatches LLT solve (potrf + potrs).
template <int UpLo>
DeviceMatrix<Scalar>& operator=(const LltSolveExpr<Scalar, UpLo>& expr) {
internal::dispatch_llt_solve(ctx_, dst_, expr);
return dst_;
}
// operator= dispatches LU solve (getrf + getrs).
DeviceMatrix<Scalar>& operator=(const LuSolveExpr<Scalar>& expr) {
internal::dispatch_lu_solve(ctx_, dst_, expr);
return dst_;
}
// operator= dispatches TRSM (triangular solve).
template <int UpLo>
DeviceMatrix<Scalar>& operator=(const TrsmExpr<Scalar, UpLo>& expr) {
internal::dispatch_trsm(ctx_, dst_, expr);
return dst_;
}
// operator= dispatches SYMM/HEMM (symmetric/Hermitian multiply).
template <int UpLo>
DeviceMatrix<Scalar>& operator=(const SymmExpr<Scalar, UpLo>& expr) {
internal::dispatch_symm(ctx_, dst_, expr);
return dst_;
}
// Catch-all: static_assert for unsupported expressions.
template <typename Expr>
DeviceMatrix<Scalar>& operator=(const Expr&) {
static_assert(sizeof(Expr) == 0,
"DeviceMatrix expression not supported: no cuBLAS/cuSOLVER mapping. "
"Supported: GEMM (A*B), TRSM (.triangularView().solve()), "
"SYMM (.selfadjointView()*B), LLT (.llt().solve()), LU (.lu().solve()).");
return dst_;
}
private:
DeviceMatrix<Scalar>& dst_;
GpuContext& ctx_;
};
// ---- Out-of-line DeviceMatrix expression operator= definitions -------------
// These are declared in DeviceMatrix.h but defined here because they need
// GpuContext::threadLocal() which requires the full GpuContext definition.
template <typename Scalar_>
template <typename Lhs, typename Rhs>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const GemmExpr<Lhs, Rhs>& expr) {
device(GpuContext::threadLocal()) = expr;
return *this;
}
template <typename Scalar_>
template <typename Lhs, typename Rhs>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const GemmExpr<Lhs, Rhs>& expr) {
device(GpuContext::threadLocal()) += expr;
return *this;
}
template <typename Scalar_>
template <int UpLo>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const LltSolveExpr<Scalar_, UpLo>& expr) {
device(GpuContext::threadLocal()) = expr;
return *this;
}
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const LuSolveExpr<Scalar_>& expr) {
device(GpuContext::threadLocal()) = expr;
return *this;
}
template <typename Scalar_>
template <int UpLo>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const TrsmExpr<Scalar_, UpLo>& expr) {
device(GpuContext::threadLocal()) = expr;
return *this;
}
template <typename Scalar_>
template <int UpLo>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const SymmExpr<Scalar_, UpLo>& expr) {
device(GpuContext::threadLocal()) = expr;
return *this;
}
// DeviceSelfAdjointView::rankUpdate — defined here because it needs GpuContext.
template <typename Scalar_, int UpLo_>
void DeviceSelfAdjointView<Scalar_, UpLo_>::rankUpdate(const DeviceMatrix<Scalar_>& A, RealScalar alpha) {
SyrkExpr<Scalar_, UpLo_> expr(A);
RealScalar beta = matrix().empty() ? RealScalar(0) : RealScalar(1);
internal::dispatch_syrk(GpuContext::threadLocal(), matrix(), expr, alpha, beta);
}
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_DISPATCH_H

View File

@@ -1,224 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Lightweight expression types for DeviceMatrix operations.
//
// These are NOT Eigen expression templates. Each type maps 1:1 to a single
// NVIDIA library call (cuBLAS or cuSOLVER). There is no coefficient-level
// evaluation, no lazy fusion, no packet operations.
//
// Expression types:
// DeviceAdjointView<S> — d_A.adjoint() → marks ConjTrans for GEMM
// DeviceTransposeView<S> — d_A.transpose() → marks Trans for GEMM
// DeviceScaled<Expr> — alpha * expr → carries scalar factor
// GemmExpr<Lhs, Rhs> — lhs * rhs → dispatches to cublasXgemm
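//
// A composition sketch (assuming DeviceMatrix<double> d_A, d_B, d_C):
//   d_C = d_A * d_B;            // GemmExpr<DeviceMatrix, DeviceMatrix>
//   d_C = d_A.adjoint() * d_B;  // GemmExpr<DeviceAdjointView, DeviceMatrix>, transA = CUBLAS_OP_C
//   d_C = 2.0 * d_A * d_B;      // GemmExpr<DeviceScaled<DeviceMatrix>, DeviceMatrix>, alpha = 2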
#ifndef EIGEN_GPU_DEVICE_EXPR_H
#define EIGEN_GPU_DEVICE_EXPR_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuBlasSupport.h"
namespace Eigen {
// Forward declaration.
template <typename Scalar_>
class DeviceMatrix;
namespace internal {
// ---- Traits: extract operation info from expression types -------------------
// Default: a DeviceMatrix is NoTrans.
template <typename T>
struct device_expr_traits {
static constexpr bool is_device_expr = false;
};
template <typename Scalar>
struct device_expr_traits<DeviceMatrix<Scalar>> {
using scalar_type = Scalar;
static constexpr GpuOp op = GpuOp::NoTrans;
static constexpr bool is_device_expr = true;
static const DeviceMatrix<Scalar>& matrix(const DeviceMatrix<Scalar>& x) { return x; }
static Scalar alpha(const DeviceMatrix<Scalar>&) { return Scalar(1); }
};
} // namespace internal
// ---- DeviceAdjointView: marks ConjTrans ------------------------------------
// Returned by DeviceMatrix::adjoint(). Maps to cublasXgemm transA/B = C.
template <typename Scalar_>
class DeviceAdjointView {
public:
using Scalar = Scalar_;
explicit DeviceAdjointView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
private:
const DeviceMatrix<Scalar>& mat_;
};
namespace internal {
template <typename Scalar>
struct device_expr_traits<DeviceAdjointView<Scalar>> {
using scalar_type = Scalar;
static constexpr GpuOp op = GpuOp::ConjTrans;
static constexpr bool is_device_expr = true;
static const DeviceMatrix<Scalar>& matrix(const DeviceAdjointView<Scalar>& x) { return x.matrix(); }
static Scalar alpha(const DeviceAdjointView<Scalar>&) { return Scalar(1); }
};
} // namespace internal
// ---- DeviceTransposeView: marks Trans --------------------------------------
// Returned by DeviceMatrix::transpose(). Maps to cublasXgemm transA/B = T.
template <typename Scalar_>
class DeviceTransposeView {
public:
using Scalar = Scalar_;
explicit DeviceTransposeView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
private:
const DeviceMatrix<Scalar>& mat_;
};
namespace internal {
template <typename Scalar>
struct device_expr_traits<DeviceTransposeView<Scalar>> {
using scalar_type = Scalar;
static constexpr GpuOp op = GpuOp::Trans;
static constexpr bool is_device_expr = true;
static const DeviceMatrix<Scalar>& matrix(const DeviceTransposeView<Scalar>& x) { return x.matrix(); }
static Scalar alpha(const DeviceTransposeView<Scalar>&) { return Scalar(1); }
};
} // namespace internal
// ---- DeviceScaled: alpha * expr --------------------------------------------
// Returned by operator*(Scalar, DeviceMatrix/View). Carries the scalar factor.
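// For example (sketch), `auto s = 2.0 * d_A.adjoint();` yields DeviceScaled<DeviceAdjointView<double>>
// with scalar() == 2.0; the factor is later folded into the GEMM alpha via device_expr_traits::alpha().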
template <typename Inner>
class DeviceScaled {
public:
using Scalar = typename internal::device_expr_traits<Inner>::scalar_type;
DeviceScaled(Scalar alpha, const Inner& inner) : alpha_(alpha), inner_(inner) {}
Scalar scalar() const { return alpha_; }
const Inner& inner() const { return inner_; }
private:
Scalar alpha_;
const Inner& inner_;
};
namespace internal {
template <typename Inner>
struct device_expr_traits<DeviceScaled<Inner>> {
using scalar_type = typename device_expr_traits<Inner>::scalar_type;
static constexpr GpuOp op = device_expr_traits<Inner>::op;
static constexpr bool is_device_expr = true;
static const DeviceMatrix<scalar_type>& matrix(const DeviceScaled<Inner>& x) {
return device_expr_traits<Inner>::matrix(x.inner());
}
static scalar_type alpha(const DeviceScaled<Inner>& x) {
return x.scalar() * device_expr_traits<Inner>::alpha(x.inner());
}
};
} // namespace internal
// ---- GemmExpr: lhs * rhs → cublasXgemm ------------------------------------
// Returned by operator*(lhs_expr, rhs_expr). Dispatches to cuBLAS GEMM.
template <typename Lhs, typename Rhs>
class GemmExpr {
public:
using Scalar = typename internal::device_expr_traits<Lhs>::scalar_type;
static_assert(std::is_same<Scalar, typename internal::device_expr_traits<Rhs>::scalar_type>::value,
"DeviceMatrix GEMM: LHS and RHS must have the same scalar type");
GemmExpr(const Lhs& lhs, const Rhs& rhs) : lhs_(lhs), rhs_(rhs) {}
const Lhs& lhs() const { return lhs_; }
const Rhs& rhs() const { return rhs_; }
private:
// Stored by reference. Expression objects must not outlive their operands.
// This is safe for the one-liner pattern (d_C = d_A * d_B) since all
// temporaries live until the semicolon.
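// Counter-example (do not do this): `auto e = d_A.adjoint() * d_B; d_C = e;` dangles,
// because the DeviceAdjointView temporary referenced by `e` dies at the first semicolon.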
const Lhs& lhs_;
const Rhs& rhs_;
};
// ---- Free operator* overloads that produce GemmExpr ------------------------
// These cover: DM*DM, Adj*DM, DM*Adj, Trans*DM, DM*Trans, Scaled*DM, etc.
// DeviceMatrix * DeviceMatrix
template <typename S>
GemmExpr<DeviceMatrix<S>, DeviceMatrix<S>> operator*(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
return {a, b};
}
// AdjointView * DeviceMatrix
template <typename S>
GemmExpr<DeviceAdjointView<S>, DeviceMatrix<S>> operator*(const DeviceAdjointView<S>& a, const DeviceMatrix<S>& b) {
return {a, b};
}
// DeviceMatrix * AdjointView
template <typename S>
GemmExpr<DeviceMatrix<S>, DeviceAdjointView<S>> operator*(const DeviceMatrix<S>& a, const DeviceAdjointView<S>& b) {
return {a, b};
}
// TransposeView * DeviceMatrix
template <typename S>
GemmExpr<DeviceTransposeView<S>, DeviceMatrix<S>> operator*(const DeviceTransposeView<S>& a, const DeviceMatrix<S>& b) {
return {a, b};
}
// DeviceMatrix * TransposeView
template <typename S>
GemmExpr<DeviceMatrix<S>, DeviceTransposeView<S>> operator*(const DeviceMatrix<S>& a, const DeviceTransposeView<S>& b) {
return {a, b};
}
// Scaled * DeviceMatrix
template <typename Inner, typename S>
GemmExpr<DeviceScaled<Inner>, DeviceMatrix<S>> operator*(const DeviceScaled<Inner>& a, const DeviceMatrix<S>& b) {
return {a, b};
}
// DeviceMatrix * Scaled
template <typename S, typename Inner>
GemmExpr<DeviceMatrix<S>, DeviceScaled<Inner>> operator*(const DeviceMatrix<S>& a, const DeviceScaled<Inner>& b) {
return {a, b};
}
// ---- Scalar * DeviceMatrix / View → DeviceScaled ---------------------------
template <typename S>
DeviceScaled<DeviceMatrix<S>> operator*(S alpha, const DeviceMatrix<S>& m) {
return {alpha, m};
}
template <typename S>
DeviceScaled<DeviceAdjointView<S>> operator*(S alpha, const DeviceAdjointView<S>& m) {
return {alpha, m};
}
template <typename S>
DeviceScaled<DeviceTransposeView<S>> operator*(S alpha, const DeviceTransposeView<S>& m) {
return {alpha, m};
}
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_EXPR_H

View File

@@ -1,517 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Typed RAII wrapper for a dense matrix in GPU device memory.
//
// DeviceMatrix<Scalar> holds a column-major matrix on the GPU with tracked
// dimensions and leading dimension. It can be passed to GPU solvers
// (GpuLLT, GpuLU, future cuBLAS/cuDSS) without host round-trips.
//
// Cross-stream safety is automatic: an internal CUDA event tracks when the
// last write completed. Consumers on a different stream wait on that event
// before reading.
//
// Usage:
// auto d_A = DeviceMatrix<double>::fromHost(A); // upload (sync)
// GpuLLT<double> llt;
// llt.compute(d_A); // factor on device
// auto d_X = llt.solve(d_B); // async, no sync
// MatrixXd X = d_X.toHost(); // download + block
//
// Async variants:
// auto d_A = DeviceMatrix<double>::fromHostAsync(A.data(), n, n, n, stream);
// auto transfer = d_X.toHostAsync(stream); // enqueue D2H
// // ... overlap with other work ...
// MatrixXd X = transfer.get(); // block + retrieve
#ifndef EIGEN_GPU_DEVICE_MATRIX_H
#define EIGEN_GPU_DEVICE_MATRIX_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
namespace Eigen {
// Forward declarations.
template <typename, int>
class GpuLLT;
template <typename>
class GpuLU;
template <typename>
class DeviceAdjointView;
template <typename>
class DeviceTransposeView;
template <typename>
class DeviceAssignment;
template <typename, typename>
class GemmExpr;
template <typename, int>
class LltSolveExpr;
template <typename>
class LuSolveExpr;
template <typename, int>
class DeviceLLTView;
template <typename>
class DeviceLUView;
template <typename, int>
class DeviceTriangularView;
template <typename, int>
class DeviceSelfAdjointView;
template <typename, int>
class ConstDeviceSelfAdjointView;
template <typename, int>
class TrsmExpr;
template <typename, int>
class SymmExpr;
template <typename, int>
class SyrkExpr;
class GpuContext;
// --------------------------------------------------------------------------
// HostTransfer — future-like wrapper for an async device-to-host transfer.
// --------------------------------------------------------------------------
/** \ingroup GPU_Module
* \class HostTransfer
* \brief Future for an asynchronous device-to-host matrix transfer.
*
* Returned by DeviceMatrix::toHostAsync(). The transfer runs asynchronously
* on the given CUDA stream. Call get() to block until complete and retrieve
* the host matrix, or ready() to poll without blocking.
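 *
 * A usage sketch (assuming a DeviceMatrix<double> d_X and a cudaStream_t s):
 * \code
 * HostTransfer<double> t = d_X.toHostAsync(s);
 * // ... overlap other host or device work ...
 * MatrixXd X = t.get();  // blocks until the copy completes; later get() calls are free
 * \endcode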
*/
template <typename Scalar_>
class HostTransfer {
public:
using Scalar = Scalar_;
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
/** Block until the transfer completes and return the host matrix.
* Idempotent: subsequent calls return the same matrix without re-syncing. */
PlainMatrix& get() {
if (!synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaEventSynchronize(event_));
synced_ = true;
}
return host_buf_;
}
/** Non-blocking check: has the transfer completed? */
bool ready() const {
if (synced_) return true;
cudaError_t err = cudaEventQuery(event_);
if (err == cudaSuccess) return true;
eigen_assert(err == cudaErrorNotReady && "cudaEventQuery failed");
return false;
}
~HostTransfer() {
if (event_) (void)cudaEventDestroy(event_);
}
HostTransfer(HostTransfer&& o) noexcept : host_buf_(std::move(o.host_buf_)), event_(o.event_), synced_(o.synced_) {
o.event_ = nullptr;
o.synced_ = true;
}
HostTransfer& operator=(HostTransfer&& o) noexcept {
if (this != &o) {
if (event_) (void)cudaEventDestroy(event_);
host_buf_ = std::move(o.host_buf_);
event_ = o.event_;
synced_ = o.synced_;
o.event_ = nullptr;
o.synced_ = true;
}
return *this;
}
HostTransfer(const HostTransfer&) = delete;
HostTransfer& operator=(const HostTransfer&) = delete;
private:
template <typename>
friend class DeviceMatrix;
HostTransfer(PlainMatrix&& buf, cudaEvent_t event) : host_buf_(std::move(buf)), event_(event), synced_(false) {}
PlainMatrix host_buf_;
cudaEvent_t event_ = nullptr;
bool synced_ = false;
};
// --------------------------------------------------------------------------
// DeviceMatrix — typed RAII wrapper for a dense matrix in device memory.
// --------------------------------------------------------------------------
/** \ingroup GPU_Module
* \class DeviceMatrix
* \brief RAII wrapper for a dense column-major matrix in GPU device memory.
*
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
*
* Owns a device allocation with tracked dimensions and leading dimension.
* An internal CUDA event records when the data was last written, enabling
* safe cross-stream consumption without user-visible synchronization.
*
* Each method has a synchronous and an asynchronous variant:
* - fromHost() / fromHostAsync(): upload from host
* - toHost() / toHostAsync(): download to host
*/
template <typename Scalar_>
class DeviceMatrix {
public:
using Scalar = Scalar_;
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
// ---- Construction / destruction ------------------------------------------
/** Default: empty (0x0, no allocation). */
DeviceMatrix() = default;
/** Allocate uninitialized device memory for a rows x cols matrix. */
DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols), outerStride_(rows) {
eigen_assert(rows >= 0 && cols >= 0);
size_t bytes = sizeInBytes();
if (bytes > 0) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
}
}
~DeviceMatrix() {
if (data_) (void)cudaFree(data_);
if (ready_event_) (void)cudaEventDestroy(ready_event_);
}
// ---- Move-only -----------------------------------------------------------
DeviceMatrix(DeviceMatrix&& o) noexcept
: data_(o.data_),
rows_(o.rows_),
cols_(o.cols_),
outerStride_(o.outerStride_),
ready_event_(o.ready_event_),
ready_stream_(o.ready_stream_),
retained_buffer_(std::move(o.retained_buffer_)) {
o.data_ = nullptr;
o.rows_ = 0;
o.cols_ = 0;
o.outerStride_ = 0;
o.ready_event_ = nullptr;
o.ready_stream_ = nullptr;
}
DeviceMatrix& operator=(DeviceMatrix&& o) noexcept {
if (this != &o) {
if (data_) (void)cudaFree(data_);
if (ready_event_) (void)cudaEventDestroy(ready_event_);
data_ = o.data_;
rows_ = o.rows_;
cols_ = o.cols_;
outerStride_ = o.outerStride_;
ready_event_ = o.ready_event_;
ready_stream_ = o.ready_stream_;
retained_buffer_ = std::move(o.retained_buffer_);
o.data_ = nullptr;
o.rows_ = 0;
o.cols_ = 0;
o.outerStride_ = 0;
o.ready_event_ = nullptr;
o.ready_stream_ = nullptr;
}
return *this;
}
DeviceMatrix(const DeviceMatrix&) = delete;
DeviceMatrix& operator=(const DeviceMatrix&) = delete;
// ---- Upload from host ----------------------------------------------------
/** Upload a host Eigen matrix to device memory (synchronous).
*
* Evaluates the expression into a contiguous ColMajor temporary, copies to
* device via cudaMemcpyAsync on \p stream, and synchronizes before returning.
*
* \param host Any Eigen matrix expression.
* \param stream CUDA stream for the transfer (default: stream 0).
*/
template <typename Derived>
static DeviceMatrix fromHost(const MatrixBase<Derived>& host, cudaStream_t stream = nullptr) {
const PlainMatrix mat(host.derived());
DeviceMatrix dm(mat.rows(), mat.cols());
if (dm.sizeInBytes() > 0) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dm.data_, mat.data(), dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
}
return dm;
}
/** Upload from a raw host pointer to device memory (asynchronous).
*
* Enqueues an async H2D copy on \p stream and records an internal event.
* The caller must keep \p host_data alive until the transfer completes
* (check via the internal event or synchronize the stream).
*
* \param host_data Pointer to contiguous column-major host data.
* \param rows Number of rows.
* \param cols Number of columns.
* \param outerStride Leading dimension (>= rows). Use rows for dense.
* \param stream CUDA stream for the transfer.
*/
static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, Index outerStride,
cudaStream_t stream) {
eigen_assert(rows >= 0 && cols >= 0 && outerStride >= rows);
eigen_assert(host_data != nullptr || (rows == 0 || cols == 0));
DeviceMatrix dm(rows, cols);
if (dm.sizeInBytes() > 0) {
// If outerStride == rows (dense), single contiguous copy.
// Otherwise, copy column by column (strided layout).
if (outerStride == rows) {
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(dm.data_, host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
} else {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(dm.data_, static_cast<size_t>(rows) * sizeof(Scalar), host_data,
static_cast<size_t>(outerStride) * sizeof(Scalar),
static_cast<size_t>(rows) * sizeof(Scalar),
static_cast<size_t>(cols), cudaMemcpyHostToDevice, stream));
}
dm.recordReady(stream);
}
return dm;
}
// ---- Download to host ----------------------------------------------------
/** Download device matrix to host memory (synchronous).
*
* Waits on the internal ready event, enqueues a D2H copy on \p stream,
* synchronizes, and returns the host matrix directly.
*
* \param stream CUDA stream for the transfer (default: stream 0).
*/
PlainMatrix toHost(cudaStream_t stream = nullptr) const {
PlainMatrix host_buf(rows_, cols_);
if (sizeInBytes() > 0) {
waitReady(stream);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(host_buf.data(), data_, sizeInBytes(), cudaMemcpyDeviceToHost, stream));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
}
return host_buf;
}
/** Enqueue an async device-to-host transfer and return a future.
*
* Waits on the internal ready event (if any) to ensure the device data is
* valid, then enqueues the D2H copy on \p stream. Returns a HostTransfer
* future; call .get() to block and retrieve the host matrix.
*
* \param stream CUDA stream for the transfer (default: stream 0).
*/
HostTransfer<Scalar> toHostAsync(cudaStream_t stream = nullptr) const {
PlainMatrix host_buf(rows_, cols_);
if (sizeInBytes() > 0) {
waitReady(stream);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(host_buf.data(), data_, sizeInBytes(), cudaMemcpyDeviceToHost, stream));
}
// Record a transfer-complete event.
cudaEvent_t transfer_event;
EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&transfer_event, cudaEventDisableTiming));
EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(transfer_event, stream));
return HostTransfer<Scalar>(std::move(host_buf), transfer_event);
}
// ---- Device-to-device copy -----------------------------------------------
/** Deep copy on device. Fully async — records event on the result, no sync.
*
* \param stream CUDA stream for the D2D copy (default: stream 0).
*/
DeviceMatrix clone(cudaStream_t stream = nullptr) const {
DeviceMatrix result(rows_, cols_);
if (sizeInBytes() > 0) {
waitReady(stream);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data_, data_, sizeInBytes(), cudaMemcpyDeviceToDevice, stream));
result.recordReady(stream);
}
return result;
}
// ---- Resize (destructive) ------------------------------------------------
/** Discard contents and reallocate to (rows x cols). Clears the ready event. */
void resize(Index rows, Index cols) {
if (rows == rows_ && cols == cols_) return;
if (data_) {
(void)cudaFree(data_);
data_ = nullptr;
}
if (ready_event_) {
(void)cudaEventDestroy(ready_event_);
ready_event_ = nullptr;
}
ready_stream_ = nullptr;
retained_buffer_ = internal::DeviceBuffer();
rows_ = rows;
cols_ = cols;
outerStride_ = rows;
size_t bytes = sizeInBytes();
if (bytes > 0) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
}
}
// ---- Accessors -----------------------------------------------------------
Scalar* data() { return data_; }
const Scalar* data() const { return data_; }
Index rows() const { return rows_; }
Index cols() const { return cols_; }
Index outerStride() const { return outerStride_; }
bool empty() const { return rows_ == 0 || cols_ == 0; }
/** Size of the device allocation in bytes. */
size_t sizeInBytes() const { return static_cast<size_t>(outerStride_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
// ---- Event synchronization (public for library dispatch interop) ---------
/** Record that device data is ready after work on \p stream. */
void recordReady(cudaStream_t stream) {
ensureEvent();
EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(ready_event_, stream));
ready_stream_ = stream;
}
/** Make \p stream wait until the device data is ready.
* No-op if no event recorded, or if the consumer stream is the same as the
* producer stream (CUDA guarantees in-order execution within a stream). */
void waitReady(cudaStream_t stream) const {
if (ready_event_ && stream != ready_stream_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamWaitEvent(stream, ready_event_, 0));
}
}
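// A typical cross-stream handoff (sketch): the producer writes m.data() on stream s1 and then
// calls m.recordReady(s1); a consumer calls m.waitReady(s2) before launching work that reads
// m.data() on stream s2. Within a single stream no wait is inserted.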
// ---- Expression methods (dispatch to cuBLAS/cuSOLVER) --------------------
/** Adjoint view for GEMM dispatch. Maps to cublasXgemm with ConjTrans. */
DeviceAdjointView<Scalar> adjoint() const { return DeviceAdjointView<Scalar>(*this); }
/** Transpose view for GEMM dispatch. Maps to cublasXgemm with Trans. */
DeviceTransposeView<Scalar> transpose() const { return DeviceTransposeView<Scalar>(*this); }
/** Bind this matrix to a GpuContext for expression assignment.
* Returns a DeviceAssignment proxy: `d_C.device(ctx) = d_A * d_B;` */
DeviceAssignment<Scalar> device(GpuContext& ctx) { return DeviceAssignment<Scalar>(*this, ctx); }
/** Assign from a GEMM expression using the thread-local default GpuContext.
 * Defined out-of-line in DeviceDispatch.h, after the full definition of GpuContext is available. */
template <typename Lhs, typename Rhs>
DeviceMatrix& operator=(const GemmExpr<Lhs, Rhs>& expr);
/** Accumulate from a GEMM expression using the thread-local default GpuContext. */
template <typename Lhs, typename Rhs>
DeviceMatrix& operator+=(const GemmExpr<Lhs, Rhs>& expr);
/** Cholesky view: d_A.llt().solve(d_B) → LltSolveExpr. */
DeviceLLTView<Scalar, Lower> llt() const { return DeviceLLTView<Scalar, Lower>(*this); }
/** Cholesky view with explicit triangle: d_A.llt<Upper>().solve(d_B). */
template <int UpLo>
DeviceLLTView<Scalar, UpLo> llt() const {
return DeviceLLTView<Scalar, UpLo>(*this);
}
/** LU view: d_A.lu().solve(d_B) → LuSolveExpr. */
DeviceLUView<Scalar> lu() const { return DeviceLUView<Scalar>(*this); }
/** Assign from an LLT solve expression (thread-local default context). */
template <int UpLo>
DeviceMatrix& operator=(const LltSolveExpr<Scalar, UpLo>& expr);
/** Assign from an LU solve expression (thread-local default context). */
DeviceMatrix& operator=(const LuSolveExpr<Scalar>& expr);
/** Triangular view: d_A.triangularView<Lower>().solve(d_B) → TrsmExpr. */
template <int UpLo>
DeviceTriangularView<Scalar, UpLo> triangularView() const {
return DeviceTriangularView<Scalar, UpLo>(*this);
}
/** Self-adjoint view (mutable): d_C.selfadjointView<Lower>().rankUpdate(d_A). */
template <int UpLo>
DeviceSelfAdjointView<Scalar, UpLo> selfadjointView() {
return DeviceSelfAdjointView<Scalar, UpLo>(*this);
}
/** Self-adjoint view (const): d_A.selfadjointView<Lower>() * d_B → SymmExpr. */
template <int UpLo>
ConstDeviceSelfAdjointView<Scalar, UpLo> selfadjointView() const {
return ConstDeviceSelfAdjointView<Scalar, UpLo>(*this);
}
/** Assign from a TRSM expression (thread-local default context). */
template <int UpLo>
DeviceMatrix& operator=(const TrsmExpr<Scalar, UpLo>& expr);
/** Assign from a SYMM expression (thread-local default context). */
template <int UpLo>
DeviceMatrix& operator=(const SymmExpr<Scalar, UpLo>& expr);
private:
// ---- Private: adopt a raw device pointer (used by friend solvers) --------
DeviceMatrix(Scalar* device_ptr, Index rows, Index cols, Index outerStride)
: data_(device_ptr), rows_(rows), cols_(cols), outerStride_(outerStride) {}
/** Transfer ownership of the device pointer out. Zeros internal state. */
Scalar* release() {
Scalar* p = data_;
data_ = nullptr;
rows_ = 0;
cols_ = 0;
outerStride_ = 0;
if (ready_event_) {
(void)cudaEventDestroy(ready_event_);
ready_event_ = nullptr;
}
ready_stream_ = nullptr;
return p;
}
// ---- Private helpers -------------------------------------------------------
void ensureEvent() {
if (!ready_event_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&ready_event_, cudaEventDisableTiming));
}
}
void retainBuffer(internal::DeviceBuffer&& buffer) { retained_buffer_ = std::move(buffer); }
// ---- Friend declarations ------------------------------------------------
template <typename, int>
friend class GpuLLT;
template <typename>
friend class GpuLU;
// ---- Data members --------------------------------------------------------
Scalar* data_ = nullptr;
Index rows_ = 0;
Index cols_ = 0;
Index outerStride_ = 0;
cudaEvent_t ready_event_ = nullptr; // internal: tracks last write completion
cudaStream_t ready_stream_ = nullptr; // stream that recorded ready_event_ (for same-stream skip)
internal::DeviceBuffer retained_buffer_; // internal: keeps async aux buffers alive
};
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_MATRIX_H

View File

@@ -1,115 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Solver expression types for DeviceMatrix.
//
// Each expression maps 1:1 to cuSOLVER library calls:
// LltSolveExpr → cusolverDnXpotrf + cusolverDnXpotrs
// LuSolveExpr → cusolverDnXgetrf + cusolverDnXgetrs
//
// Usage:
// d_X = d_A.llt().solve(d_B); // Cholesky solve
// d_X.device(ctx) = d_A.lu().solve(d_B); // LU solve on explicit stream
#ifndef EIGEN_GPU_DEVICE_SOLVER_EXPR_H
#define EIGEN_GPU_DEVICE_SOLVER_EXPR_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
namespace Eigen {
// Forward declarations.
template <typename Scalar_>
class DeviceMatrix;
class GpuContext;
// ---- LLT solve expression ---------------------------------------------------
// d_A.llt().solve(d_B) → LltSolveExpr → cusolverDnXpotrf + cusolverDnXpotrs
template <typename Scalar_, int UpLo_ = Lower>
class LltSolveExpr {
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };
LltSolveExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
const DeviceMatrix<Scalar>& matrix() const { return A_; }
const DeviceMatrix<Scalar>& rhs() const { return B_; }
private:
const DeviceMatrix<Scalar>& A_;
const DeviceMatrix<Scalar>& B_;
};
// ---- LU solve expression ----------------------------------------------------
// d_A.lu().solve(d_B) → LuSolveExpr → cusolverDnXgetrf + cusolverDnXgetrs
template <typename Scalar_>
class LuSolveExpr {
public:
using Scalar = Scalar_;
LuSolveExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
const DeviceMatrix<Scalar>& matrix() const { return A_; }
const DeviceMatrix<Scalar>& rhs() const { return B_; }
private:
const DeviceMatrix<Scalar>& A_;
const DeviceMatrix<Scalar>& B_;
};
// ---- DeviceLLTView: d_A.llt() → view with .solve() and .device() -----------
template <typename Scalar_, int UpLo_ = Lower>
class DeviceLLTView {
public:
using Scalar = Scalar_;
explicit DeviceLLTView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
/** Build a solve expression: d_A.llt().solve(d_B).
* The expression is evaluated when assigned to a DeviceMatrix. */
LltSolveExpr<Scalar, UpLo_> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
// For cached factorizations, use the explicit GpuLLT API directly:
// GpuLLT<double> llt;
// llt.compute(d_A);
// auto d_X1 = llt.solve(d_B1);
// auto d_X2 = llt.solve(d_B2);
private:
const DeviceMatrix<Scalar>& mat_;
};
// ---- DeviceLUView: d_A.lu() → view with .solve() and .device() -------------
template <typename Scalar_>
class DeviceLUView {
public:
using Scalar = Scalar_;
explicit DeviceLUView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
/** Build a solve expression: d_A.lu().solve(d_B). */
LuSolveExpr<Scalar> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
// For cached factorizations, use the explicit GpuLU API directly:
// GpuLU<double> lu;
// lu.compute(d_A);
// auto d_X1 = lu.solve(d_B1);
// auto d_X2 = lu.solve(d_B2);
private:
const DeviceMatrix<Scalar>& mat_;
};
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_SOLVER_EXPR_H

View File

@@ -1,83 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Unified GPU execution context.
//
// GpuContext owns a CUDA stream and all NVIDIA library handles (cuBLAS,
// cuSOLVER, future cuDSS/cuSPARSE). It is the entry point for all GPU
// operations on DeviceMatrix.
//
// Usage:
// GpuContext ctx; // explicit context
// d_C.device(ctx) = d_A * d_B; // GEMM on ctx's stream
//
// d_C = d_A * d_B; // thread-local default context
// GpuContext& ctx = GpuContext::threadLocal();
#ifndef EIGEN_GPU_CONTEXT_H
#define EIGEN_GPU_CONTEXT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuBlasSupport.h"
#include "./CuSolverSupport.h"
namespace Eigen {
/** \ingroup GPU_Module
* \class GpuContext
* \brief Unified GPU execution context owning a CUDA stream and library handles.
*
* Each GpuContext instance creates a dedicated CUDA stream, a cuBLAS handle,
* and a cuSOLVER handle, all bound to that stream. Multiple contexts enable
* concurrent execution on independent streams.
*
* A lazily-created thread-local default is available via threadLocal() for
* simple single-stream usage.
*/
class GpuContext {
public:
GpuContext() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
}
~GpuContext() {
if (cusolver_) (void)cusolverDnDestroy(cusolver_);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
// Non-copyable, non-movable (owns library handles).
GpuContext(const GpuContext&) = delete;
GpuContext& operator=(const GpuContext&) = delete;
/** Lazily-created thread-local default context. */
static GpuContext& threadLocal() {
thread_local GpuContext ctx;
return ctx;
}
cudaStream_t stream() const { return stream_; }
cublasHandle_t cublasHandle() const { return cublas_; }
cusolverDnHandle_t cusolverHandle() const { return cusolver_; }
private:
cudaStream_t stream_ = nullptr;
cublasHandle_t cublas_ = nullptr;
cusolverDnHandle_t cusolver_ = nullptr;
};
} // namespace Eigen
#endif // EIGEN_GPU_CONTEXT_H

View File

@@ -1,385 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Eigen Authors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU Cholesky (LLT) decomposition using cuSOLVER.
//
// Unlike Eigen's CPU LLT<MatrixType>, GpuLLT keeps the factored Cholesky
// factor in device memory for the lifetime of the object. Multiple solves
// against the same factor therefore only transfer the RHS and solution
// vectors, not the factor itself.
//
// Requires CUDA 11.4+ (cusolverDnXpotrf / cusolverDnXpotrs generic API).
//
// Usage:
// GpuLLT<double> llt(A); // upload A, potrf, L stays on device
// if (llt.info() != Success) { ... }
// MatrixXd x1 = llt.solve(b1); // potrs, only b1 transferred
// MatrixXd x2 = llt.solve(b2); // L already on device
#ifndef EIGEN_GPU_LLT_H
#define EIGEN_GPU_LLT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSolverSupport.h"
#include <vector>
namespace Eigen {
/** \ingroup GPU_Module
* \class GpuLLT
* \brief GPU Cholesky (LL^T) decomposition via cuSOLVER
*
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
* \tparam UpLo_ Triangle used: Lower (default) or Upper
*
* Factorizes a symmetric positive-definite matrix A = LL^H on the GPU and
* caches the factor L in device memory. Each subsequent solve(B) uploads only
* B, calls cusolverDnXpotrs, and downloads the result — the factor is not
* re-transferred.
*
* Each GpuLLT object owns a dedicated CUDA stream and cuSOLVER handle,
* enabling concurrent factorizations from multiple objects on the same host
* thread.
*/
template <typename Scalar_, int UpLo_ = Lower>
class GpuLLT {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
enum { UpLo = UpLo_ };
// ---- Construction / destruction ------------------------------------------
/** Default constructor. Does not factorize; call compute() before solve(). */
GpuLLT() { init_context(); }
/** Factor A immediately. Equivalent to GpuLLT llt; llt.compute(A). */
template <typename InputType>
explicit GpuLLT(const EigenBase<InputType>& A) {
init_context();
compute(A);
}
~GpuLLT() {
// Ignore errors in destructors — cannot propagate.
if (handle_) (void)cusolverDnDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
// Non-copyable (owns device memory and library handles).
GpuLLT(const GpuLLT&) = delete;
GpuLLT& operator=(const GpuLLT&) = delete;
// Movable.
GpuLLT(GpuLLT&& o) noexcept
: stream_(o.stream_),
handle_(o.handle_),
params_(std::move(o.params_)),
d_factor_(std::move(o.d_factor_)),
factor_alloc_size_(o.factor_alloc_size_),
d_scratch_(std::move(o.d_scratch_)),
scratch_size_(o.scratch_size_),
h_workspace_(std::move(o.h_workspace_)),
n_(o.n_),
lda_(o.lda_),
info_(o.info_),
info_word_(o.info_word_),
info_synced_(o.info_synced_) {
o.stream_ = nullptr;
o.handle_ = nullptr;
o.factor_alloc_size_ = 0;
o.scratch_size_ = 0;
o.n_ = 0;
o.info_ = InvalidInput;
o.info_word_ = 0;
o.info_synced_ = true;
}
GpuLLT& operator=(GpuLLT&& o) noexcept {
if (this != &o) {
if (handle_) (void)cusolverDnDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
stream_ = o.stream_;
handle_ = o.handle_;
params_ = std::move(o.params_);
d_factor_ = std::move(o.d_factor_);
factor_alloc_size_ = o.factor_alloc_size_;
d_scratch_ = std::move(o.d_scratch_);
scratch_size_ = o.scratch_size_;
h_workspace_ = std::move(o.h_workspace_);
n_ = o.n_;
lda_ = o.lda_;
info_ = o.info_;
info_word_ = o.info_word_;
info_synced_ = o.info_synced_;
o.stream_ = nullptr;
o.handle_ = nullptr;
o.factor_alloc_size_ = 0;
o.scratch_size_ = 0;
o.n_ = 0;
o.info_ = InvalidInput;
o.info_word_ = 0;
o.info_synced_ = true;
}
return *this;
}
// ---- Factorization -------------------------------------------------------
/** Compute the Cholesky factorization of A (host matrix).
*
* Uploads A to device memory, calls cusolverDnXpotrf, and retains the
* factored matrix on device. Any previous factorization is overwritten.
*/
template <typename InputType>
GpuLLT& compute(const EigenBase<InputType>& A) {
eigen_assert(A.rows() == A.cols());
if (!begin_compute(A.rows())) return *this;
// Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions).
const PlainMatrix mat(A.derived());
lda_ = static_cast<int64_t>(mat.outerStride());
allocate_factor_storage();
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_factor_.ptr, mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_));
factorize();
return *this;
}
/** Compute the Cholesky factorization from a device-resident matrix (D2D copy). */
GpuLLT& compute(const DeviceMatrix<Scalar>& d_A) {
eigen_assert(d_A.rows() == d_A.cols());
if (!begin_compute(d_A.rows())) return *this;
lda_ = static_cast<int64_t>(d_A.outerStride());
d_A.waitReady(stream_);
allocate_factor_storage();
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_factor_.ptr, d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, stream_));
factorize();
return *this;
}
/** Compute the Cholesky factorization from a device matrix (move, no copy). */
GpuLLT& compute(DeviceMatrix<Scalar>&& d_A) {
eigen_assert(d_A.rows() == d_A.cols());
if (!begin_compute(d_A.rows())) return *this;
lda_ = static_cast<int64_t>(d_A.outerStride());
d_A.waitReady(stream_);
d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
factorize();
return *this;
}
// ---- Solve ---------------------------------------------------------------
/** Solve A * X = B using the cached Cholesky factor (host → host).
*
* Uploads B to device memory, calls cusolverDnXpotrs using the factor
* retained from compute(), and returns the solution X on the host.
* The factor is not re-transferred; only B goes up and X comes down.
*
* \pre compute() must have been called and info() == Success.
* \returns X such that A * X ≈ B
*/
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
const_cast<GpuLLT*>(this)->sync_info();
eigen_assert(info_ == Success && "GpuLLT::solve called on a failed or uninitialized factorization");
eigen_assert(B.rows() == n_);
const PlainMatrix rhs(B);
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_ptr, rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
});
PlainMatrix X(n_, B.cols());
int solve_info = 0;
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
eigen_assert(solve_info == 0 && "cusolverDnXpotrs reported an error");
return X;
}
/** Solve A * X = B with device-resident RHS. Fully async.
*
* All work is enqueued on this solver's stream. Returns a DeviceMatrix
* with a recorded ready event — no host synchronization occurs.
* The caller should check info() after compute() to verify the
* factorization succeeded; this method does not check.
*/
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
eigen_assert(d_B.rows() == n_);
d_B.waitReady(stream_);
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
return solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_ptr, d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
});
}
// ---- Accessors -----------------------------------------------------------
/** Returns Success if the last compute() succeeded, NumericalIssue otherwise.
* Lazily synchronizes the stream on first call after compute(). */
ComputationInfo info() const {
const_cast<GpuLLT*>(this)->sync_info();
return info_;
}
Index rows() const { return n_; }
Index cols() const { return n_; }
/** Returns the CUDA stream owned by this object.
* Advanced users may submit additional GPU work on this stream
* to overlap with or chain after GpuLLT operations. */
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusolverDnHandle_t handle_ = nullptr;
internal::CusolverParams params_; // cuSOLVER params (created once, reused)
internal::DeviceBuffer d_factor_; // factored L (or U) on device (grows, never shrinks)
size_t factor_alloc_size_ = 0; // current d_factor_ allocation size
internal::DeviceBuffer d_scratch_; // combined workspace + info word (grows, never shrinks)
size_t scratch_size_ = 0; // current scratch allocation size
std::vector<char> h_workspace_; // host workspace (kept alive until next compute)
Index n_ = 0;
int64_t lda_ = 0;
ComputationInfo info_ = InvalidInput;
int info_word_ = 0; // host-side target for async info download
bool info_synced_ = true; // has the stream been synced for info?
bool begin_compute(Index rows) {
n_ = rows;
info_ = InvalidInput;
if (n_ == 0) {
info_ = Success;
return false;
}
return true;
}
size_t factorBytes() const { return rhsBytes(static_cast<int64_t>(n_), lda_); }
static size_t rhsBytes(int64_t cols, int64_t outer_stride) {
return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
}
void allocate_factor_storage() {
size_t needed = factorBytes();
if (needed > factor_alloc_size_) {
d_factor_ = internal::DeviceBuffer(needed);
factor_alloc_size_ = needed;
}
}
  // Ensure d_scratch_ can hold `workspace_bytes` plus an aligned info word.
  // Layout: [workspace (workspace_bytes) | info_word (sizeof(int))].
  // Grows but never shrinks. Syncs the stream before reallocating to
  // avoid freeing memory that async kernels may still be using.
void ensure_scratch(size_t workspace_bytes) {
// Round up so the info word is naturally aligned.
// 16-byte alignment for optimal GPU memory access.
constexpr size_t kAlign = 16;
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
size_t needed = workspace_bytes + sizeof(int);
if (needed > scratch_size_) {
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_scratch_ = internal::DeviceBuffer(needed);
scratch_size_ = needed;
}
}
void* scratch_workspace() const { return d_scratch_.ptr; }
int* scratch_info() const {
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
}
template <typename CopyRhs>
DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, CopyRhs&& copy_rhs) const {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_, ColMajor>::value;
Scalar* d_x_ptr = nullptr;
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_x_ptr), rhsBytes(nrhs, ldb)));
copy_rhs(d_x_ptr);
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, static_cast<int64_t>(n_), nrhs, dtype,
d_factor_.ptr, lda_, dtype, d_x_ptr, ldb, scratch_info()));
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs), static_cast<Index>(ldb));
result.recordReady(stream_);
return result;
}
void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
ensure_scratch(0); // allocate at least the info word
}
// Synchronize stream and interpret the info word. No-op if already synced.
void sync_info() {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
info_ = (info_word_ == 0) ? Success : NumericalIssue;
info_synced_ = true;
}
}
// Run cusolverDnXpotrf on d_factor_ (already on device).
// Enqueues factorization + async info download. Does NOT sync.
// Workspaces are stored as members to ensure they outlive the async kernels.
void factorize() {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_, ColMajor>::value;
info_synced_ = false;
info_ = InvalidInput;
size_t dev_ws_bytes = 0, host_ws_bytes = 0;
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(handle_, params_.p, uplo, static_cast<int64_t>(n_), dtype,
d_factor_.ptr, lda_, dtype, &dev_ws_bytes, &host_ws_bytes));
ensure_scratch(dev_ws_bytes);
h_workspace_.resize(host_ws_bytes);
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(
handle_, params_.p, uplo, static_cast<int64_t>(n_), dtype, d_factor_.ptr, lda_, dtype, scratch_workspace(),
dev_ws_bytes, host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info()));
// Enqueue async download of info word — sync deferred to info() or solve().
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
}
};
} // namespace Eigen
#endif // EIGEN_GPU_LLT_H

View File

@@ -1,371 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Eigen Authors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU partial-pivoting LU decomposition using cuSOLVER.
//
// Wraps cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve).
// The factored LU matrix and pivot array are kept in device memory for the
// lifetime of the object, so repeated solves only transfer the RHS/solution.
//
// Requires CUDA 11.0+ (cusolverDnX generic API).
//
// Usage:
// GpuLU<double> lu(A); // upload A, getrf, LU+ipiv on device
// if (lu.info() != Success) { ... }
// MatrixXd x = lu.solve(b); // getrs NoTrans, only b transferred
// MatrixXd xt = lu.solve(b, GpuLU<double>::Transpose); // A^T x = b
#ifndef EIGEN_GPU_LU_H
#define EIGEN_GPU_LU_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSolverSupport.h"
#include <vector>
namespace Eigen {
/** \ingroup GPU_Module
* \class GpuLU
* \brief GPU LU decomposition with partial pivoting via cuSOLVER
*
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
*
* Decomposes a square matrix A = P L U on the GPU and retains the factored
* matrix and pivot array in device memory. Solves A*X=B, A^T*X=B, or
* A^H*X=B by passing the appropriate TransposeMode.
*
* Each GpuLU object owns a dedicated CUDA stream and cuSOLVER handle.
*/
template <typename Scalar_>
class GpuLU {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
/** Controls which system is solved in solve(). */
enum TransposeMode {
NoTranspose, ///< Solve A * X = B
Transpose, ///< Solve A^T * X = B
ConjugateTranspose ///< Solve A^H * X = B (same as Transpose for real types)
};
// ---- Construction / destruction ------------------------------------------
GpuLU() { init_context(); }
template <typename InputType>
explicit GpuLU(const EigenBase<InputType>& A) {
init_context();
compute(A);
}
~GpuLU() {
if (handle_) (void)cusolverDnDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuLU(const GpuLU&) = delete;
GpuLU& operator=(const GpuLU&) = delete;
GpuLU(GpuLU&& o) noexcept
: stream_(o.stream_),
handle_(o.handle_),
params_(std::move(o.params_)),
d_lu_(std::move(o.d_lu_)),
lu_alloc_size_(o.lu_alloc_size_),
d_ipiv_(std::move(o.d_ipiv_)),
d_scratch_(std::move(o.d_scratch_)),
scratch_size_(o.scratch_size_),
h_workspace_(std::move(o.h_workspace_)),
n_(o.n_),
lda_(o.lda_),
info_(o.info_),
info_word_(o.info_word_),
info_synced_(o.info_synced_) {
o.stream_ = nullptr;
o.handle_ = nullptr;
o.lu_alloc_size_ = 0;
o.scratch_size_ = 0;
o.n_ = 0;
o.info_ = InvalidInput;
o.info_word_ = 0;
o.info_synced_ = true;
}
GpuLU& operator=(GpuLU&& o) noexcept {
if (this != &o) {
if (handle_) (void)cusolverDnDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
stream_ = o.stream_;
handle_ = o.handle_;
params_ = std::move(o.params_);
d_lu_ = std::move(o.d_lu_);
lu_alloc_size_ = o.lu_alloc_size_;
d_ipiv_ = std::move(o.d_ipiv_);
d_scratch_ = std::move(o.d_scratch_);
scratch_size_ = o.scratch_size_;
h_workspace_ = std::move(o.h_workspace_);
n_ = o.n_;
lda_ = o.lda_;
info_ = o.info_;
info_word_ = o.info_word_;
info_synced_ = o.info_synced_;
o.stream_ = nullptr;
o.handle_ = nullptr;
o.lu_alloc_size_ = 0;
o.scratch_size_ = 0;
o.n_ = 0;
o.info_ = InvalidInput;
o.info_word_ = 0;
o.info_synced_ = true;
}
return *this;
}
// ---- Factorization -------------------------------------------------------
/** Compute the LU factorization of A (host matrix, must be square). */
template <typename InputType>
GpuLU& compute(const EigenBase<InputType>& A) {
eigen_assert(A.rows() == A.cols() && "GpuLU requires a square matrix");
if (!begin_compute(A.rows())) return *this;
const PlainMatrix mat(A.derived());
lda_ = static_cast<int64_t>(mat.outerStride());
allocate_lu_storage();
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_));
factorize();
return *this;
}
/** Compute the LU factorization from a device-resident matrix (D2D copy). */
GpuLU& compute(const DeviceMatrix<Scalar>& d_A) {
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
if (!begin_compute(d_A.rows())) return *this;
lda_ = static_cast<int64_t>(d_A.outerStride());
d_A.waitReady(stream_);
allocate_lu_storage();
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_));
factorize();
return *this;
}
/** Compute the LU factorization from a device matrix (move, no copy). */
GpuLU& compute(DeviceMatrix<Scalar>&& d_A) {
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
if (!begin_compute(d_A.rows())) return *this;
lda_ = static_cast<int64_t>(d_A.outerStride());
d_A.waitReady(stream_);
d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
factorize();
return *this;
}
// ---- Solve ---------------------------------------------------------------
/** Solve op(A) * X = B using the cached LU factorization (host → host).
*
* \param B Right-hand side (n x nrhs host matrix).
* \param mode NoTranspose (default), Transpose, or ConjugateTranspose.
*/
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B, TransposeMode mode = NoTranspose) const {
const_cast<GpuLU*>(this)->sync_info();
eigen_assert(info_ == Success && "GpuLU::solve called on a failed or uninitialized factorization");
eigen_assert(B.rows() == n_);
const PlainMatrix rhs(B);
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_ptr, rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
});
PlainMatrix X(n_, B.cols());
int solve_info = 0;
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(X.data(), d_X.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
eigen_assert(solve_info == 0 && "cusolverDnXgetrs reported an error");
return X;
}
/** Solve op(A) * X = B with device-resident RHS. Fully async. */
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B, TransposeMode mode = NoTranspose) const {
eigen_assert(d_B.rows() == n_);
d_B.waitReady(stream_);
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
return solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_ptr, d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
});
}
// ---- Accessors -----------------------------------------------------------
/** Lazily synchronizes the stream on first call after compute(). */
ComputationInfo info() const {
const_cast<GpuLU*>(this)->sync_info();
return info_;
}
Index rows() const { return n_; }
Index cols() const { return n_; }
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusolverDnHandle_t handle_ = nullptr;
internal::CusolverParams params_; // cuSOLVER params (created once, reused)
internal::DeviceBuffer d_lu_; // LU factors on device (grows, never shrinks)
size_t lu_alloc_size_ = 0; // current d_lu_ allocation size
internal::DeviceBuffer d_ipiv_; // pivot indices (int64_t) on device
internal::DeviceBuffer d_scratch_; // combined workspace + info word (grows, never shrinks)
size_t scratch_size_ = 0; // current scratch allocation size
std::vector<char> h_workspace_; // host workspace (kept alive until next compute)
Index n_ = 0;
int64_t lda_ = 0;
ComputationInfo info_ = InvalidInput;
int info_word_ = 0; // host-side target for async info download
bool info_synced_ = true; // has the stream been synced for info?
bool begin_compute(Index rows) {
n_ = rows;
info_ = InvalidInput;
if (n_ == 0) {
info_ = Success;
return false;
}
return true;
}
size_t matrixBytes() const { return matrixBytes(static_cast<int64_t>(n_), lda_); }
static size_t matrixBytes(int64_t cols, int64_t outer_stride) {
return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
}
void allocate_lu_storage() {
size_t needed = matrixBytes();
if (needed > lu_alloc_size_) {
d_lu_ = internal::DeviceBuffer(needed);
lu_alloc_size_ = needed;
}
}
  // Ensure d_scratch_ can hold `workspace_bytes` plus an aligned info word.
  // Layout: [workspace (workspace_bytes) | info_word (sizeof(int))].
  // Grows but never shrinks. Syncs the stream before reallocating to
  // avoid freeing memory that async kernels may still be using.
void ensure_scratch(size_t workspace_bytes) {
constexpr size_t kAlign = 16;
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
size_t needed = workspace_bytes + sizeof(int);
if (needed > scratch_size_) {
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_scratch_ = internal::DeviceBuffer(needed);
scratch_size_ = needed;
}
}
void* scratch_workspace() const { return d_scratch_.ptr; }
int* scratch_info() const {
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
}
template <typename CopyRhs>
DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, TransposeMode mode, CopyRhs&& copy_rhs) const {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
const cublasOperation_t trans = to_cublas_op(mode);
Scalar* d_x_ptr = nullptr;
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_x_ptr), matrixBytes(nrhs, ldb)));
copy_rhs(d_x_ptr);
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(handle_, params_.p, trans, static_cast<int64_t>(n_), nrhs, dtype, d_lu_.ptr,
lda_, static_cast<const int64_t*>(d_ipiv_.ptr), dtype, d_x_ptr, ldb,
scratch_info()));
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs), static_cast<Index>(ldb));
result.recordReady(stream_);
return result;
}
void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
ensure_scratch(0); // allocate at least the info word
}
void sync_info() {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
info_ = (info_word_ == 0) ? Success : NumericalIssue;
info_synced_ = true;
}
}
// Run cusolverDnXgetrf on d_lu_ (already on device). Allocates d_ipiv_.
// Enqueues factorization + async info download. Does NOT sync.
// Workspaces are stored as members to ensure they outlive the async kernels.
void factorize() {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
const size_t ipiv_bytes = static_cast<size_t>(n_) * sizeof(int64_t);
info_synced_ = false;
info_ = InvalidInput;
d_ipiv_ = internal::DeviceBuffer(ipiv_bytes);
size_t dev_ws_bytes = 0, host_ws_bytes = 0;
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(handle_, params_.p, static_cast<int64_t>(n_),
static_cast<int64_t>(n_), dtype, d_lu_.ptr, lda_, dtype,
&dev_ws_bytes, &host_ws_bytes));
ensure_scratch(dev_ws_bytes);
h_workspace_.resize(host_ws_bytes);
EIGEN_CUSOLVER_CHECK(
cusolverDnXgetrf(handle_, params_.p, static_cast<int64_t>(n_), static_cast<int64_t>(n_), dtype, d_lu_.ptr, lda_,
static_cast<int64_t*>(d_ipiv_.ptr), dtype, scratch_workspace(), dev_ws_bytes,
host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info()));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
}
static cublasOperation_t to_cublas_op(TransposeMode mode) {
switch (mode) {
case Transpose:
return CUBLAS_OP_T;
case ConjugateTranspose:
return CUBLAS_OP_C;
default:
return CUBLAS_OP_N;
}
}
};
} // namespace Eigen
#endif // EIGEN_GPU_LU_H

View File

@@ -1,101 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Generic CUDA runtime support shared across all GPU library integrations
// (cuSOLVER, cuBLAS, cuDSS, etc.):
// - Error-checking macros
// - RAII device buffer
//
// Only depends on <cuda_runtime.h>. No NVIDIA library headers.
#ifndef EIGEN_GPU_SUPPORT_H
#define EIGEN_GPU_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include <cuda_runtime.h>
namespace Eigen {
namespace internal {
// ---- Error-checking macros --------------------------------------------------
// These abort (via eigen_assert) on failure. Not for use in destructors.
#define EIGEN_CUDA_RUNTIME_CHECK(expr) \
do { \
cudaError_t _e = (expr); \
eigen_assert(_e == cudaSuccess && "CUDA runtime call failed"); \
} while (0)
// ---- RAII: device buffer ----------------------------------------------------
struct DeviceBuffer {
void* ptr = nullptr;
DeviceBuffer() = default;
explicit DeviceBuffer(size_t bytes) {
if (bytes > 0) EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&ptr, bytes));
}
~DeviceBuffer() {
if (ptr) (void)cudaFree(ptr); // destructor: ignore errors
}
// Move-only.
DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr) { o.ptr = nullptr; }
DeviceBuffer& operator=(DeviceBuffer&& o) noexcept {
if (this != &o) {
if (ptr) (void)cudaFree(ptr);
ptr = o.ptr;
o.ptr = nullptr;
}
return *this;
}
DeviceBuffer(const DeviceBuffer&) = delete;
DeviceBuffer& operator=(const DeviceBuffer&) = delete;
// Adopt an existing device pointer. Caller relinquishes ownership.
static DeviceBuffer adopt(void* p) {
DeviceBuffer b;
b.ptr = p;
return b;
}
};
// ---- Scalar → cudaDataType_t ------------------------------------------------
// Shared by cuBLAS and cuSOLVER. cudaDataType_t is defined in library_types.h
// which is included transitively by cuda_runtime.h.
template <typename Scalar>
struct cuda_data_type;
template <>
struct cuda_data_type<float> {
static constexpr cudaDataType_t value = CUDA_R_32F;
};
template <>
struct cuda_data_type<double> {
static constexpr cudaDataType_t value = CUDA_R_64F;
};
template <>
struct cuda_data_type<std::complex<float>> {
static constexpr cudaDataType_t value = CUDA_C_32F;
};
template <>
struct cuda_data_type<std::complex<double>> {
static constexpr cudaDataType_t value = CUDA_C_64F;
};
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_SUPPORT_H

View File

@@ -1,3 +0,0 @@
#ifndef EIGEN_GPU_MODULE_H
#error "Please include Eigen/GPU instead of including headers inside the src/GPU directory directly."
#endif

View File

@@ -1,318 +0,0 @@
# Eigen GPU Module (`Eigen/GPU`)
GPU-accelerated dense linear algebra for Eigen users, dispatching to NVIDIA
CUDA libraries (cuBLAS, cuSOLVER). Requires CUDA 11.4+. Header-only (link
against CUDA runtime, cuBLAS, and cuSOLVER).
## Why this module
Eigen is the linear algebra foundation for a large ecosystem of C++ projects
in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP,
Open3D), scientific computing (Ceres, Stan), and beyond. Many of these
projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations
without dropping down to raw CUDA library APIs. Third-party projects like
[EigenCuda](https://github.com/NLESC-JCER/EigenCuda) and
[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically to fill
this gap, and downstream projects like
[Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
GPU-accelerated solvers through Eigen.
The `Eigen/GPU` module aims to close this gap: Existing Eigen users should be
able to move performance-critical dense linear algebra to the GPU with minimal
code changes and without learning CUDA library APIs directly.
## Design philosophy
**CPU and GPU coexist.** There is no global compile-time switch that replaces
CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers
explicitly -- `GpuLLT<double>` vs `LLT<MatrixXd>` -- and both coexist in
the same binary. This also lets users keep the factored matrix on device across
multiple solves, something impossible with compile-time replacement.
**Familiar syntax.** GPU operations use the same expression patterns as CPU
Eigen. Here is a side-by-side comparison:
```cpp
// ---- CPU (Eigen) ---- // ---- GPU (Eigen/GPU) ----
#include <Eigen/Dense> #define EIGEN_USE_GPU
#include <Eigen/GPU>
MatrixXd A = ...; auto d_A = DeviceMatrix<double>::fromHost(A);
MatrixXd B = ...; auto d_B = DeviceMatrix<double>::fromHost(B);
MatrixXd C = A * B; DeviceMatrix<double> d_C = d_A * d_B;
MatrixXd X = A.llt().solve(B); DeviceMatrix<double> d_X = d_A.llt().solve(d_B);
MatrixXd X = d_X.toHost();
```
The GPU version reads like CPU Eigen with explicit upload/download.
`operator*` dispatches to cuBLAS GEMM, `.llt().solve()` dispatches to
cuSOLVER potrf + potrs. Unsupported expressions are compile errors.
**Explicit over implicit.** Host-device transfers, stream management, and
library handle lifetimes are visible in the API. There are no hidden
allocations or synchronizations except where documented (e.g., `toHost()` must
synchronize to deliver data to the host).
## Key concepts
### `DeviceMatrix<Scalar>`
A typed RAII wrapper for a dense column-major matrix in GPU device memory.
This is the GPU counterpart of Eigen's `MatrixX<Scalar>`. A vector is simply
a `DeviceMatrix` with one column.
```cpp
// Upload from host
auto d_A = DeviceMatrix<double>::fromHost(A);
// Allocate uninitialized
DeviceMatrix<double> d_C(m, n);
// Download to host
MatrixXd C = d_C.toHost();
// Async download (returns a future)
auto transfer = d_C.toHostAsync();
// ... do other work ...
MatrixXd C = transfer.get();
```
`DeviceMatrix` supports expression methods that mirror Eigen's API:
`adjoint()`, `transpose()`, `triangularView<UpLo>()`,
`selfadjointView<UpLo>()`, `llt()`, `lu()`. These return lightweight
expression objects that are evaluated when assigned.
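For illustration, here is a minimal sketch of the lazy-evaluation pattern. It
assumes `A` and `B` are existing host matrices of compatible sizes, with `A`
square and positive-definite for the Cholesky line:
```cpp
// Views and operator* return lightweight expression objects; the
// corresponding library call runs only when the expression is assigned.
auto d_A = DeviceMatrix<double>::fromHost(A);
auto d_B = DeviceMatrix<double>::fromHost(B);

DeviceMatrix<double> d_C = d_A.adjoint() * d_B;                     // cuBLAS GEMM (transA=C)
DeviceMatrix<double> d_X = d_A.triangularView<Lower>().solve(d_B);  // cuBLAS TRSM
DeviceMatrix<double> d_Y = d_A.llt().solve(d_B);                    // cuSOLVER potrf + potrs
```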
### `GpuContext`
Every GPU operation needs a CUDA stream and library handles (cuBLAS,
cuSOLVER). `GpuContext` bundles these together.
For simple usage, you don't need to create one -- a per-thread default context
is created lazily on first use:
```cpp
// These use the thread-local default context automatically
d_C = d_A * d_B;
d_X = d_A.llt().solve(d_B);
```
For concurrent multi-stream execution, create explicit contexts:
```cpp
GpuContext ctx1, ctx2;
d_C1.device(ctx1) = d_A1 * d_B1; // runs on stream 1
d_C2.device(ctx2) = d_A2 * d_B2; // runs on stream 2 (concurrently)
```
## Usage
### Matrix operations (cuBLAS)
```cpp
auto d_A = DeviceMatrix<double>::fromHost(A);
auto d_B = DeviceMatrix<double>::fromHost(B);
// GEMM: C = A * B, C = A^H * B, C = A * B^T, ...
DeviceMatrix<double> d_C = d_A * d_B;
d_C = d_A.adjoint() * d_B;
d_C = d_A * d_B.transpose();
// Scaled and accumulated
d_C += 2.0 * d_A * d_B; // alpha=2, beta=1
d_C.device(ctx) -= d_A * d_B; // alpha=-1, beta=1 (requires explicit context)
// Triangular solve (TRSM)
d_X = d_A.triangularView<Lower>().solve(d_B);
// Symmetric/Hermitian multiply (SYMM/HEMM)
d_C = d_A.selfadjointView<Lower>() * d_B;
// Rank-k update (SYRK/HERK)
d_C.selfadjointView<Lower>().rankUpdate(d_A); // C += A * A^H
```
### Dense solvers (cuSOLVER)
**One-shot expression syntax** -- Convenient, re-factorizes each time:
```cpp
// Cholesky solve (potrf + potrs)
d_X = d_A.llt().solve(d_B);
// LU solve (getrf + getrs)
d_Y = d_A.lu().solve(d_B);
```
**Cached factorization** -- Factor once, solve many times:
```cpp
GpuLLT<double> llt;
llt.compute(d_A); // factorize (async)
if (llt.info() != Success) { ... } // lazy sync on first info() call
auto d_X1 = llt.solve(d_B1); // reuses factor (async)
auto d_X2 = llt.solve(d_B2); // reuses factor (async)
MatrixXd X2 = d_X2.toHost();
// LU with transpose solve
GpuLU<double> lu;
lu.compute(d_A);
auto d_Y = lu.solve(d_B, GpuLU<double>::Transpose); // A^T Y = B
```
The cached API keeps the factored matrix on device, avoiding redundant
host-device transfers and re-factorizations.
### Stream control and async execution
Operations are asynchronous by default. The compute-solve chain runs without
host synchronization until you need a result on the host:
```
fromHost(A) --sync--> compute() --async--> solve() --async--> toHost()
H2D potrf potrs D2H
sync
```
Mandatory sync points:
- `fromHost()` -- Synchronizes to complete the upload before returning
- `toHost()` / `HostTransfer::get()` -- Must deliver data to host
- `info()` -- Must read the factorization status
**Cross-stream safety** is automatic. `DeviceMatrix` tracks write completion
via CUDA events. When a matrix written on stream A is read on stream B, the
module automatically inserts `cudaStreamWaitEvent`. Same-stream operations
skip the wait (CUDA guarantees in-order execution within a stream).
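Here is a small sketch of that hand-off. It assumes `d_A`, `d_B`, `d_T`, and
`d_C` are pre-allocated square `DeviceMatrix<double>` objects of the same size;
the wait is inserted by the module, not by the caller:
```cpp
GpuContext ctx1, ctx2;

d_T.device(ctx1) = d_A * d_B;   // producer: d_T is written on ctx1's stream

// Consumer on a different stream: safe. The write above recorded an event on
// d_T, and the module makes ctx2's stream wait on that event before launching
// this GEMM (cudaStreamWaitEvent under the hood).
d_C.device(ctx2) = d_T * d_B;

MatrixXd C = d_C.toHost();      // single host sync, only when the result is needed
```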
## Reference
### Supported scalar types
`float`, `double`, `std::complex<float>`, `std::complex<double>`.
### Expression -> library call mapping
| DeviceMatrix expression | Library call | Parameters |
|---|---|---|
| `C = A * B` | `cublasGemmEx` | transA=N, transB=N, alpha=1, beta=0 |
| `C = A.adjoint() * B` | `cublasGemmEx` | transA=C, transB=N |
| `C = A.transpose() * B` | `cublasGemmEx` | transA=T, transB=N |
| `C = A * B.adjoint()` | `cublasGemmEx` | transA=N, transB=C |
| `C = A * B.transpose()` | `cublasGemmEx` | transA=N, transB=T |
| `C = alpha * A * B` | `cublasGemmEx` | alpha from LHS |
| `C = A * (alpha * B)` | `cublasGemmEx` | alpha from RHS |
| `C += A * B` | `cublasGemmEx` | alpha=1, beta=1 |
| `C.device(ctx) -= A * B` | `cublasGemmEx` | alpha=-1, beta=1 |
| `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs |
| `X = A.llt<Upper>().solve(B)` | same | uplo=Upper |
| `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs |
| `X = A.triangularView<L>().solve(B)` | `cublasXtrsm` | side=L, uplo, diag=NonUnit |
| `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
| `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
### `DeviceMatrix<Scalar>` API
| Method | Sync? | Description |
|--------|-------|-------------|
| `DeviceMatrix()` | -- | Empty (0x0) |
| `DeviceMatrix(rows, cols)` | -- | Allocate uninitialized |
| `fromHost(matrix, stream)` | yes | Upload from Eigen matrix |
| `fromHostAsync(ptr, rows, cols, outerStride, stream)` | no | Async upload (caller manages lifetime) |
| `toHost(stream)` | yes | Synchronous download |
| `toHostAsync(stream)` | no | Returns `HostTransfer` future |
| `clone(stream)` | no | Device-to-device deep copy |
| `resize(rows, cols)` | -- | Discard contents, reallocate |
| `data()` | -- | Raw device pointer |
| `rows()`, `cols()` | -- | Dimensions |
| `sizeInBytes()` | -- | Total device allocation size in bytes |
| `empty()` | -- | True if 0x0 |
| `adjoint()` | -- | Adjoint view (GEMM ConjTrans) |
| `transpose()` | -- | Transpose view (GEMM Trans) |
| `llt()` / `llt<UpLo>()` | -- | Cholesky expression builder |
| `lu()` | -- | LU expression builder |
| `triangularView<UpLo>()` | -- | Triangular view (TRSM) |
| `selfadjointView<UpLo>()` | -- | Self-adjoint view (SYMM, rankUpdate) |
| `device(ctx)` | -- | Assignment proxy bound to context |
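A short sketch exercising a few of the entries above. `d_A`, `d_B`, `m`, and
`n` are placeholders, and `clone` is assumed to accept a raw CUDA stream as
listed in the table:
```cpp
GpuContext& ctx = GpuContext::threadLocal();

DeviceMatrix<double> d_C(m, n);               // uninitialized m x n allocation
d_C.device(ctx) = d_A * d_B;                  // filled by a GEMM on ctx's stream

DeviceMatrix<double> d_copy = d_C.clone(ctx.stream());  // device-to-device deep copy

auto transfer = d_copy.toHostAsync();         // enqueue D2H; returns a HostTransfer
// ... other host-side work can overlap with the download ...
MatrixXd C = transfer.get();                  // blocks until the copy has landed
```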
### `GpuContext`
Unified GPU execution context owning a CUDA stream and library handles.
```cpp
GpuContext() // Creates dedicated stream + handles
static GpuContext& threadLocal() // Per-thread default (lazy-created)
cudaStream_t stream()
cublasHandle_t cublasHandle()
cusolverDnHandle_t cusolverHandle()
```
Non-copyable, non-movable (owns library handles).
### `GpuLLT<Scalar, UpLo>` API
GPU dense Cholesky (LL^T) via cuSOLVER. Caches factor on device.
| Method | Sync? | Description |
|--------|-------|-------------|
| `GpuLLT(A)` | deferred | Construct and factorize from host matrix |
| `compute(host_matrix)` | deferred | Upload and factorize |
| `compute(DeviceMatrix)` | deferred | D2D copy and factorize |
| `compute(DeviceMatrix&&)` | deferred | Move-adopt and factorize (no copy) |
| `solve(host_matrix)` | yes | Solve, return host matrix |
| `solve(DeviceMatrix)` | no | Solve, return `DeviceMatrix` (async) |
| `info()` | lazy | Syncs stream on first call, returns `Success` or `NumericalIssue` |
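For example, a sketch of the move-adopt path, which factors a device-resident
matrix without an extra device-to-device copy (`d_A` and `d_B` are
placeholders; `d_A` is left empty after the move):
```cpp
GpuLLT<double> llt;
llt.compute(std::move(d_A));                 // adopts d_A's buffer; no D2D copy
if (llt.info() != Success) { /* handle NumericalIssue */ }

DeviceMatrix<double> d_X = llt.solve(d_B);   // async, result stays on device
MatrixXd X = d_X.toHost();                   // sync only when the host needs X
```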
### `GpuLU<Scalar>` API
GPU dense partial-pivoting LU via cuSOLVER. Same pattern as `GpuLLT`, plus
`TransposeMode` parameter on `solve()` (`NoTranspose`, `Transpose`,
`ConjugateTranspose`).
### `HostTransfer<Scalar>` API
Future for async device-to-host transfer.
| Method | Description |
|--------|-------------|
| `get()` | Block until transfer completes, return host matrix reference. Idempotent. |
| `ready()` | Non-blocking poll |
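A brief sketch of overlapping a download with host work; `do_other_host_work`
is a placeholder for whatever the caller wants to run on the CPU meanwhile:
```cpp
auto transfer = d_X.toHostAsync();   // enqueue the D2H copy; returns immediately

while (!transfer.ready()) {
  do_other_host_work();              // overlap CPU work with the transfer
}
MatrixXd X = transfer.get();         // complete by now; get() is idempotent
```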
### Aliasing
Unlike Eigen's `Matrix`, where omitting `.noalias()` makes a product evaluate
into a temporary first, `DeviceMatrix` dispatches directly to NVIDIA library
calls, which have no built-in aliasing protection. All operations are
implicitly `noalias`: the caller must ensure that operands don't alias the
destination for GEMM and TRSM (debug asserts catch violations).
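A sketch of the pitfall and the safe pattern (`d_A` and `d_B` are placeholders):
```cpp
// Unsafe: the destination aliases an operand. cuBLAS GEMM has no aliasing
// protection, so the result would be garbage (a debug build trips an assert):
// d_A = d_A * d_B;

// Safe: evaluate into a distinct destination matrix.
DeviceMatrix<double> d_C = d_A * d_B;
```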
## File layout
| File | Depends on | Contents |
|------|-----------|----------|
| `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `cuda_data_type<>` |
| `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` |
| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM expression wrappers |
| `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers |
| `DeviceSolverExpr.h` | `DeviceMatrix.h` | Solver expression wrappers (LLT, LU) |
| `DeviceDispatch.h` | all above | All dispatch functions + `DeviceAssignment` |
| `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `GpuContext` |
| `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>` | cuBLAS error macro, op/compute type maps |
| `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
## Building and testing
```bash
cmake -G Ninja -B build -S . \
-DEIGEN_TEST_CUDA=ON \
-DEIGEN_CUDA_COMPUTE_ARCH="70" \
-DEIGEN_TEST_CUBLAS=ON \
-DEIGEN_TEST_CUSOLVER=ON
cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu gpu_device_matrix
ctest --test-dir build -R "gpu_cublas|gpu_cusolver|gpu_device" --output-on-failure
```

View File

@@ -43,10 +43,3 @@ add_subdirectory(Householder)
add_subdirectory(Solvers)
add_subdirectory(Tuning)
add_subdirectory(BLAS)
# GPU benchmarks have their own CMake project (needs CUDAToolkit).
# They can also be built standalone: cmake -B build -S benchmarks/GPU
find_package(CUDAToolkit QUIET)
if(CUDAToolkit_FOUND)
add_subdirectory(GPU)
endif()

View File

@@ -1,53 +0,0 @@
# GPU benchmarks require CUDA runtime + cuSOLVER.
# Build separately from the main benchmark tree since they need CUDA toolchain.
#
# Usage:
# cmake -G Ninja -B build-bench-gpu -S benchmarks/GPU \
# -DCMAKE_CUDA_ARCHITECTURES=89
# cmake --build build-bench-gpu
#
# Profiling:
# nsys profile --trace=cuda ./build-bench-gpu/bench_gpu_solvers
# ncu --set full -o profile ./build-bench-gpu/bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096
cmake_minimum_required(VERSION 3.18)
project(EigenGpuBenchmarks CXX)
find_package(benchmark REQUIRED)
find_package(CUDAToolkit REQUIRED)
set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")
function(eigen_add_gpu_benchmark name source)
cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN})
if(NOT IS_ABSOLUTE "${source}")
set(source "${CMAKE_CURRENT_SOURCE_DIR}/${source}")
endif()
add_executable(${name} ${source})
target_include_directories(${name} PRIVATE
${EIGEN_SOURCE_DIR}
${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(${name} PRIVATE
benchmark::benchmark benchmark::benchmark_main
CUDA::cudart CUDA::cusolver CUDA::cublas)
if(BENCH_LIBRARIES)
target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES})
endif()
target_compile_options(${name} PRIVATE -O3 -DNDEBUG)
target_compile_definitions(${name} PRIVATE EIGEN_USE_GPU)
if(BENCH_DEFINITIONS)
target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS})
endif()
endfunction()
# Solver benchmarks: LLT/LU compute + solve, host vs device paths, CPU baselines.
eigen_add_gpu_benchmark(bench_gpu_solvers bench_gpu_solvers.cpp)
eigen_add_gpu_benchmark(bench_gpu_solvers_float bench_gpu_solvers.cpp DEFINITIONS SCALAR=float)
# Chaining benchmarks: async pipeline efficiency, host-roundtrip vs device chain.
eigen_add_gpu_benchmark(bench_gpu_chaining bench_gpu_chaining.cpp)
eigen_add_gpu_benchmark(bench_gpu_chaining_float bench_gpu_chaining.cpp DEFINITIONS SCALAR=float)
# Batching benchmarks: multi-stream concurrency for many small systems.
eigen_add_gpu_benchmark(bench_gpu_batching bench_gpu_batching.cpp)
eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITIONS SCALAR=float)

View File

@@ -1,268 +0,0 @@
// GPU batching benchmarks: multi-stream concurrency for many small solves.
//
// Each GpuLLT/GpuLU owns its own CUDA stream. This benchmark measures how
// well multiple solver instances overlap on the GPU, which is critical for
// workloads like robotics (many small systems) and SLAM (batched poses).
//
// Compares:
// 1. Sequential: one solver handles all systems one by one
// 2. Batched: N solvers on N streams, all launched before any sync
// 3. CPU baseline: Eigen LLT on host
//
// For Nsight Systems: batched mode should show overlapping kernels on
// different streams in the timeline view.
//
// nsys profile --trace=cuda ./bench_gpu_batching
#include <benchmark/benchmark.h>
#include <Eigen/Cholesky>
#include <Eigen/GPU>
#include <memory>
#include <vector>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR double
#endif
using Scalar = SCALAR;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
static Mat make_spd(Index n) {
Mat M = Mat::Random(n, n);
return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
}
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
// --------------------------------------------------------------------------
// Sequential: one solver, N systems solved one after another
// --------------------------------------------------------------------------
static void BM_Batch_Sequential(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const int batch_size = static_cast<int>(state.range(1));
// Pre-generate all SPD matrices and RHS vectors.
std::vector<Mat> As(batch_size);
std::vector<Mat> Bs(batch_size);
for (int i = 0; i < batch_size; ++i) {
As[i] = make_spd(n);
Bs[i] = Mat::Random(n, 1);
}
GpuLLT<Scalar> llt;
for (auto _ : state) {
std::vector<Mat> results(batch_size);
for (int i = 0; i < batch_size; ++i) {
llt.compute(As[i]);
results[i] = llt.solve(Bs[i]);
}
benchmark::DoNotOptimize(results.back().data());
}
state.counters["n"] = n;
state.counters["batch"] = batch_size;
state.counters["total_solves"] = batch_size;
}
// --------------------------------------------------------------------------
// Sequential with DeviceMatrix (avoid re-upload of A each iteration)
// --------------------------------------------------------------------------
static void BM_Batch_Sequential_Device(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const int batch_size = static_cast<int>(state.range(1));
std::vector<Mat> As(batch_size);
std::vector<Mat> Bs(batch_size);
std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
for (int i = 0; i < batch_size; ++i) {
As[i] = make_spd(n);
Bs[i] = Mat::Random(n, 1);
d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
}
GpuLLT<Scalar> llt;
for (auto _ : state) {
std::vector<Mat> results(batch_size);
for (int i = 0; i < batch_size; ++i) {
llt.compute(d_As[i]);
DeviceMatrix<Scalar> d_X = llt.solve(d_Bs[i]);
results[i] = d_X.toHost();
}
benchmark::DoNotOptimize(results.back().data());
}
state.counters["n"] = n;
state.counters["batch"] = batch_size;
state.counters["total_solves"] = batch_size;
}
// --------------------------------------------------------------------------
// Batched: N solvers on N streams, overlapping execution
// --------------------------------------------------------------------------
static void BM_Batch_MultiStream(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const int batch_size = static_cast<int>(state.range(1));
std::vector<Mat> As(batch_size);
std::vector<Mat> Bs(batch_size);
std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
for (int i = 0; i < batch_size; ++i) {
As[i] = make_spd(n);
Bs[i] = Mat::Random(n, 1);
d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
}
// N solvers = N independent CUDA streams.
std::vector<std::unique_ptr<GpuLLT<Scalar>>> solvers(batch_size);
for (int i = 0; i < batch_size; ++i) {
solvers[i] = std::make_unique<GpuLLT<Scalar>>();
}
for (auto _ : state) {
// Phase 1: launch all factorizations (async, different streams).
for (int i = 0; i < batch_size; ++i) {
solvers[i]->compute(d_As[i]);
}
// Phase 2: launch all solves (async, different streams).
std::vector<DeviceMatrix<Scalar>> d_Xs(batch_size);
for (int i = 0; i < batch_size; ++i) {
d_Xs[i] = solvers[i]->solve(d_Bs[i]);
}
// Phase 3: download all results.
std::vector<Mat> results(batch_size);
for (int i = 0; i < batch_size; ++i) {
results[i] = d_Xs[i].toHost();
}
benchmark::DoNotOptimize(results.back().data());
}
state.counters["n"] = n;
state.counters["batch"] = batch_size;
state.counters["streams"] = batch_size;
state.counters["total_solves"] = batch_size;
}
// --------------------------------------------------------------------------
// Batched with async download (overlap D2H with computation)
// --------------------------------------------------------------------------
static void BM_Batch_MultiStream_AsyncDownload(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const int batch_size = static_cast<int>(state.range(1));
std::vector<Mat> As(batch_size);
std::vector<Mat> Bs(batch_size);
std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
for (int i = 0; i < batch_size; ++i) {
As[i] = make_spd(n);
Bs[i] = Mat::Random(n, 1);
d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
}
std::vector<std::unique_ptr<GpuLLT<Scalar>>> solvers(batch_size);
for (int i = 0; i < batch_size; ++i) {
solvers[i] = std::make_unique<GpuLLT<Scalar>>();
}
for (auto _ : state) {
// Launch all compute + solve.
std::vector<DeviceMatrix<Scalar>> d_Xs(batch_size);
for (int i = 0; i < batch_size; ++i) {
solvers[i]->compute(d_As[i]);
d_Xs[i] = solvers[i]->solve(d_Bs[i]);
}
// Enqueue all async downloads.
std::vector<HostTransfer<Scalar>> transfers;
transfers.reserve(batch_size);
for (int i = 0; i < batch_size; ++i) {
transfers.push_back(d_Xs[i].toHostAsync());
}
// Collect all results.
for (int i = 0; i < batch_size; ++i) {
benchmark::DoNotOptimize(transfers[i].get().data());
}
}
state.counters["n"] = n;
state.counters["batch"] = batch_size;
state.counters["streams"] = batch_size;
state.counters["total_solves"] = batch_size;
}
// --------------------------------------------------------------------------
// CPU baseline: Eigen LLT on host, sequential
// --------------------------------------------------------------------------
static void BM_Batch_CPU(benchmark::State& state) {
const Index n = state.range(0);
const int batch_size = static_cast<int>(state.range(1));
std::vector<Mat> As(batch_size);
std::vector<Mat> Bs(batch_size);
for (int i = 0; i < batch_size; ++i) {
As[i] = make_spd(n);
Bs[i] = Mat::Random(n, 1);
}
for (auto _ : state) {
std::vector<Mat> results(batch_size);
for (int i = 0; i < batch_size; ++i) {
LLT<Mat> llt(As[i]);
results[i] = llt.solve(Bs[i]);
}
benchmark::DoNotOptimize(results.back().data());
}
state.counters["n"] = n;
state.counters["batch"] = batch_size;
state.counters["total_solves"] = batch_size;
}
// --------------------------------------------------------------------------
// Registration
// --------------------------------------------------------------------------
// clang-format off
// Args: {matrix_size, batch_size}
// Small matrices with large batches are the interesting case for multi-stream.
BENCHMARK(BM_Batch_Sequential)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_Sequential_Device)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_CPU)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
// Also run larger sizes with moderate batching.
BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond);
// clang-format on

View File

@@ -1,216 +0,0 @@
// GPU chaining benchmarks: measure async pipeline efficiency.
//
// Compares:
// 1. Host round-trip per solve (baseline)
// 2. DeviceMatrix chaining (no host round-trip between solves)
// 3. Varying chain lengths (1, 2, 4, 8 consecutive solves)
//
// For Nsight Systems: look for gaps between kernel launches in the timeline.
// Host round-trip creates visible idle gaps; chaining should show back-to-back kernels.
//
// nsys profile --trace=cuda,nvtx ./bench_gpu_chaining
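//
// Sketch of the device-chaining pattern being timed (mirrors BM_Chain_Device below):
//   GpuLLT<Scalar> llt(A);                          // factor once, L stays on device
//   auto d_B = DeviceMatrix<Scalar>::fromHost(B);
//   DeviceMatrix<Scalar> d_X = llt.solve(d_B);      // async, no host copy
//   d_X = llt.solve(d_X);                           // chain device-to-device
//   Mat X = d_X.toHost();                           // single synchronization point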
#include <benchmark/benchmark.h>
#include <Eigen/Cholesky>
#include <Eigen/GPU>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR double
#endif
using Scalar = SCALAR;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
static Mat make_spd(Index n) {
Mat M = Mat::Random(n, n);
return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
}
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
// --------------------------------------------------------------------------
// Baseline: host round-trip between every solve
// --------------------------------------------------------------------------
static void BM_Chain_HostRoundtrip(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const int chain_len = static_cast<int>(state.range(1));
Mat A = make_spd(n);
Mat B = Mat::Random(n, 1);
GpuLLT<Scalar> llt(A);
for (auto _ : state) {
Mat X = B;
for (int i = 0; i < chain_len; ++i) {
X = llt.solve(X); // host → device → host each time
}
benchmark::DoNotOptimize(X.data());
}
state.counters["n"] = n;
state.counters["chain"] = chain_len;
state.counters["solves/iter"] = chain_len;
}
// --------------------------------------------------------------------------
// DeviceMatrix chaining: no host round-trip between solves
// --------------------------------------------------------------------------
static void BM_Chain_Device(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const int chain_len = static_cast<int>(state.range(1));
Mat A = make_spd(n);
Mat B = Mat::Random(n, 1);
GpuLLT<Scalar> llt(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
for (auto _ : state) {
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
for (int i = 1; i < chain_len; ++i) {
d_X = llt.solve(d_X); // device → device, fully async
}
Mat X = d_X.toHost(); // single sync at end
benchmark::DoNotOptimize(X.data());
}
state.counters["n"] = n;
state.counters["chain"] = chain_len;
state.counters["solves/iter"] = chain_len;
}
// --------------------------------------------------------------------------
// DeviceMatrix chaining with async download (overlap D2H with next iteration)
// --------------------------------------------------------------------------
static void BM_Chain_DeviceAsync(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const int chain_len = static_cast<int>(state.range(1));
Mat A = make_spd(n);
Mat B = Mat::Random(n, 1);
GpuLLT<Scalar> llt(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
for (auto _ : state) {
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
for (int i = 1; i < chain_len; ++i) {
d_X = llt.solve(d_X);
}
auto transfer = d_X.toHostAsync();
Mat X = transfer.get();
benchmark::DoNotOptimize(X.data());
}
state.counters["n"] = n;
state.counters["chain"] = chain_len;
state.counters["solves/iter"] = chain_len;
}
// --------------------------------------------------------------------------
// Pure GPU chain (no download — measures kernel-only throughput)
// --------------------------------------------------------------------------
static void BM_Chain_DeviceNoDownload(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const int chain_len = static_cast<int>(state.range(1));
Mat A = make_spd(n);
Mat B = Mat::Random(n, 1);
GpuLLT<Scalar> llt(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
for (auto _ : state) {
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
for (int i = 1; i < chain_len; ++i) {
d_X = llt.solve(d_X);
}
cudaStreamSynchronize(llt.stream());
benchmark::DoNotOptimize(d_X.data());
}
state.counters["n"] = n;
state.counters["chain"] = chain_len;
state.counters["solves/iter"] = chain_len;
}
// --------------------------------------------------------------------------
// Compute + solve chain (full pipeline: factorize, then chain solves)
// --------------------------------------------------------------------------
static void BM_FullPipeline_Host(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const int chain_len = static_cast<int>(state.range(1));
Mat A = make_spd(n);
Mat B = Mat::Random(n, 1);
for (auto _ : state) {
GpuLLT<Scalar> llt(A);
Mat X = B;
for (int i = 0; i < chain_len; ++i) {
X = llt.solve(X);
}
benchmark::DoNotOptimize(X.data());
}
state.counters["n"] = n;
state.counters["chain"] = chain_len;
}
static void BM_FullPipeline_Device(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const int chain_len = static_cast<int>(state.range(1));
Mat A = make_spd(n);
Mat B = Mat::Random(n, 1);
for (auto _ : state) {
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuLLT<Scalar> llt;
llt.compute(d_A);
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
for (int i = 1; i < chain_len; ++i) {
d_X = llt.solve(d_X);
}
Mat X = d_X.toHost();
benchmark::DoNotOptimize(X.data());
}
state.counters["n"] = n;
state.counters["chain"] = chain_len;
}
// --------------------------------------------------------------------------
// Registration
// --------------------------------------------------------------------------
// clang-format off
// Args: {matrix_size, chain_length}
BENCHMARK(BM_Chain_HostRoundtrip)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Chain_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Chain_DeviceAsync)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Chain_DeviceNoDownload)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_FullPipeline_Host)->ArgsProduct({{256, 1024, 4096}, {1, 4}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_FullPipeline_Device)->ArgsProduct({{256, 1024, 4096}, {1, 4}})->Unit(benchmark::kMicrosecond);
// clang-format on

View File

@@ -1,296 +0,0 @@
// GPU solver benchmarks: GpuLLT and GpuLU compute + solve throughput.
//
// Measures factorization and solve performance for the host-matrix and
// DeviceMatrix code paths across a range of matrix sizes.
//
// For Nsight Systems profiling:
// nsys profile --trace=cuda,nvtx ./bench_gpu_solvers
//
// For Nsight Compute kernel analysis:
// ncu --set full -o profile ./bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096
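//
// The host and device paths differ only in what compute()/solve() receive
// (sketch assembled from the benchmarks below):
//   GpuLLT<Scalar> llt;
//   llt.compute(A);                                  // host matrix: includes H2D upload
//   llt.compute(DeviceMatrix<Scalar>::fromHost(A));  // DeviceMatrix: device-side copy only
//   Mat X = llt.solve(B);                            // host RHS: H2D + potrs + D2H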
#include <benchmark/benchmark.h>
#include <Eigen/Cholesky>
#include <Eigen/GPU>
#include <Eigen/LU>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR double
#endif
using Scalar = SCALAR;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
// --------------------------------------------------------------------------
// Helpers
// --------------------------------------------------------------------------
static Mat make_spd(Index n) {
Mat M = Mat::Random(n, n);
return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
}
// CUDA warm-up: ensure the GPU is initialized before timing.
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
// --------------------------------------------------------------------------
// GpuLLT benchmarks
// --------------------------------------------------------------------------
// Factorize from host matrix (includes H2D upload).
static void BM_GpuLLT_Compute_Host(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
Mat A = make_spd(n);
GpuLLT<Scalar> llt;
for (auto _ : state) {
llt.compute(A);
if (llt.info() != Success) state.SkipWithError("factorization failed");
}
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
// Factorize from DeviceMatrix (D2D copy path).
static void BM_GpuLLT_Compute_Device(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
Mat A = make_spd(n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
GpuLLT<Scalar> llt;
for (auto _ : state) {
llt.compute(d_A);
if (llt.info() != Success) state.SkipWithError("factorization failed");
}
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
// Factorize from DeviceMatrix (move path, no copy).
static void BM_GpuLLT_Compute_DeviceMove(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
Mat A = make_spd(n);
GpuLLT<Scalar> llt;
for (auto _ : state) {
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
llt.compute(std::move(d_A));
if (llt.info() != Success) state.SkipWithError("factorization failed");
}
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
// Solve from host matrix (H2D + potrs + D2H).
static void BM_GpuLLT_Solve_Host(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const Index nrhs = state.range(1);
Mat A = make_spd(n);
Mat B = Mat::Random(n, nrhs);
GpuLLT<Scalar> llt(A);
for (auto _ : state) {
Mat X = llt.solve(B);
benchmark::DoNotOptimize(X.data());
}
state.counters["n"] = n;
state.counters["nrhs"] = nrhs;
}
// Solve from DeviceMatrix (D2D + potrs, async, toHost at end).
static void BM_GpuLLT_Solve_Device(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const Index nrhs = state.range(1);
Mat A = make_spd(n);
Mat B = Mat::Random(n, nrhs);
GpuLLT<Scalar> llt(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
for (auto _ : state) {
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
Mat X = d_X.toHost();
benchmark::DoNotOptimize(X.data());
}
state.counters["n"] = n;
state.counters["nrhs"] = nrhs;
}
// Solve staying entirely on device (no toHost — measures pure GPU time).
static void BM_GpuLLT_Solve_DeviceOnly(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const Index nrhs = state.range(1);
Mat A = make_spd(n);
Mat B = Mat::Random(n, nrhs);
GpuLLT<Scalar> llt(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
for (auto _ : state) {
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
// Force completion without D2H transfer.
cudaStreamSynchronize(llt.stream());
benchmark::DoNotOptimize(d_X.data());
}
state.counters["n"] = n;
state.counters["nrhs"] = nrhs;
}
// --------------------------------------------------------------------------
// GpuLU benchmarks
// --------------------------------------------------------------------------
static void BM_GpuLU_Compute_Host(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
Mat A = Mat::Random(n, n);
GpuLU<Scalar> lu;
for (auto _ : state) {
lu.compute(A);
if (lu.info() != Success) state.SkipWithError("factorization failed");
}
double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
static void BM_GpuLU_Compute_Device(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
Mat A = Mat::Random(n, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
GpuLU<Scalar> lu;
for (auto _ : state) {
lu.compute(d_A);
if (lu.info() != Success) state.SkipWithError("factorization failed");
}
double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
static void BM_GpuLU_Solve_Host(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const Index nrhs = state.range(1);
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
GpuLU<Scalar> lu(A);
for (auto _ : state) {
Mat X = lu.solve(B);
benchmark::DoNotOptimize(X.data());
}
state.counters["n"] = n;
state.counters["nrhs"] = nrhs;
}
static void BM_GpuLU_Solve_Device(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const Index nrhs = state.range(1);
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
GpuLU<Scalar> lu(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
for (auto _ : state) {
DeviceMatrix<Scalar> d_X = lu.solve(d_B);
Mat X = d_X.toHost();
benchmark::DoNotOptimize(X.data());
}
state.counters["n"] = n;
state.counters["nrhs"] = nrhs;
}
// --------------------------------------------------------------------------
// CPU baselines for comparison
// --------------------------------------------------------------------------
static void BM_CpuLLT_Compute(benchmark::State& state) {
const Index n = state.range(0);
Mat A = make_spd(n);
LLT<Mat> llt;
for (auto _ : state) {
llt.compute(A);
benchmark::DoNotOptimize(llt.matrixLLT().data());
}
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
static void BM_CpuLU_Compute(benchmark::State& state) {
const Index n = state.range(0);
Mat A = Mat::Random(n, n);
PartialPivLU<Mat> lu;
for (auto _ : state) {
lu.compute(A);
benchmark::DoNotOptimize(lu.matrixLU().data());
}
double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
// --------------------------------------------------------------------------
// Registration
// --------------------------------------------------------------------------
// clang-format off
BENCHMARK(BM_GpuLLT_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Compute_DeviceMove)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_DeviceOnly)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_CpuLLT_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_CpuLU_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
// clang-format on

View File

@@ -197,7 +197,7 @@ build:linux:x86-64:nvhpc-26.1:default:unsupported:
# Additional flags passed to the cuda compiler.
EIGEN_CI_CUDA_CXX_FLAGS: ""
# Compute architectures present in the GitLab CI runners.
EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75"
EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
EIGEN_CI_BUILD_TARGET: buildtests_gpu
EIGEN_CI_TEST_CUDA_CLANG: "off"
EIGEN_CI_TEST_CUDA_NVC: "off"
@@ -211,20 +211,20 @@ build:linux:x86-64:nvhpc-26.1:default:unsupported:
# Build on regular linux to limit GPU cost.
- saas-linux-2xlarge-amd64
# GCC-11, CUDA-12.2
build:linux:cuda-12.2:gcc-11:
# GCC-10, CUDA-12.2
build:linux:cuda-12.2:gcc-10:
extends: .build:linux:cuda
image: nvidia/cuda:12.2.0-devel-ubuntu22.04
image: nvidia/cuda:12.2.0-devel-ubuntu20.04
variables:
EIGEN_CI_C_COMPILER: gcc-11
EIGEN_CI_CXX_COMPILER: g++-11
EIGEN_CI_C_COMPILER: gcc-10
EIGEN_CI_CXX_COMPILER: g++-10
# Clang-14, CUDA-12.2
build:linux:cuda-12.2:clang-14:
extends: build:linux:cuda-12.2:gcc-11
# Clang-12, CUDA-12.2
build:linux:cuda-12.2:clang-12:
extends: build:linux:cuda-12.2:gcc-10
variables:
EIGEN_CI_C_COMPILER: clang-14
EIGEN_CI_CXX_COMPILER: clang++-14
EIGEN_CI_C_COMPILER: clang-12
EIGEN_CI_CXX_COMPILER: clang++-12
EIGEN_CI_TEST_CUDA_CLANG: "on"
@@ -234,7 +234,7 @@ build:linux:cuda-12.2:clang-14:
# ROCm HIP
build:linux:rocm-latest:gcc-10:
extends: .build:linux:cross
image: rocm/dev-ubuntu-24.04:6.3.1
image: rocm/dev-ubuntu-24.04:latest
variables:
EIGEN_CI_C_COMPILER: gcc-10
EIGEN_CI_CXX_COMPILER: g++-10

View File

@@ -55,7 +55,7 @@ build:windows:x86-64:msvc-14.29:avx512dq:
extends: .build:windows
variables:
# Compute architectures present in the GitLab CI runners.
EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75"
EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
EIGEN_CI_BUILD_TARGET: buildtests_gpu
EIGEN_CI_ADDITIONAL_ARGS:
-DEIGEN_TEST_CUDA=on
@@ -66,8 +66,8 @@ build:windows:x86-64:msvc-14.29:avx512dq:
- x86-64
- cuda
# MSVC 14.29 + CUDA 12.2
build:windows:x86-64:cuda-12.2:msvc-14.29:
# MSVC 14.29 + CUDA 11.4
build:windows:x86-64:cuda-11.4:msvc-14.29:
extends: .build:windows:cuda
variables:
EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V12_2
EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V11_4

View File

@@ -265,23 +265,23 @@ test:linux:x86-64:nvhpc-26.1:default:unsupported:
tags:
- saas-linux-medium-amd64-gpu-standard
# GCC-11, CUDA-12.2
test:linux:cuda-12.2:gcc-11:
# GCC-10, CUDA-12.2
test:linux:cuda-12.2:gcc-10:
extends: .test:linux:cuda
image: nvidia/cuda:12.2.0-devel-ubuntu22.04
needs: [ build:linux:cuda-12.2:gcc-11 ]
image: nvidia/cuda:12.2.0-devel-ubuntu20.04
needs: [ build:linux:cuda-12.2:gcc-10 ]
variables:
EIGEN_CI_CXX_COMPILER: g++-11
EIGEN_CI_CC_COMPILER: gcc-11
EIGEN_CI_CXX_COMPILER: g++-10
EIGEN_CI_CC_COMPILER: gcc-10
# Clang-14, CUDA-12.2
test:linux:cuda-12.2:clang-14:
# Clang-12, CUDA-12.2
test:linux:cuda-12.2:clang-12:
extends: .test:linux:cuda
image: nvidia/cuda:12.2.0-devel-ubuntu22.04
needs: [ build:linux:cuda-12.2:clang-14 ]
image: nvidia/cuda:12.2.0-devel-ubuntu20.04
needs: [ build:linux:cuda-12.2:clang-12 ]
variables:
EIGEN_CI_CXX_COMPILER: clang++-14
EIGEN_CI_CC_COMPILER: clang-14
EIGEN_CI_CXX_COMPILER: clang++-12
EIGEN_CI_CC_COMPILER: clang-12
##### arm ######################################################################

View File

@@ -71,7 +71,7 @@ test:windows:x86-64:msvc-14.29:avx512dq:unsupported:
- x86-64
- cuda
# MSVC 14.29 + CUDA 12.2
test:windows:x86-64:cuda-12.2:msvc-14.29:
# MSVC 14.29 + CUDA 11.4
test:windows:x86-64:cuda-11.4:msvc-14.29:
extends: .test:windows:cuda
needs: [ build:windows:x86-64:cuda-12.2:msvc-14.29 ]
needs: [ build:windows:x86-64:cuda-11.4:msvc-14.29 ]

View File

@@ -20,8 +20,7 @@ add_dependencies(check buildtests)
# Convenience target for only building GPU tests.
add_custom_target(buildtests_gpu)
add_custom_target(check_gpu COMMAND "ctest" ${EIGEN_CTEST_ARGS}
"--output-on-failure"
add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure"
"--no-compress-output"
"--build-no-clean"
"-T" "test"
@@ -72,3 +71,4 @@ elseif(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS")
endif()

View File

@@ -8,12 +8,6 @@ macro(ei_add_property prop value)
endif()
endmacro()
if(EIGEN_TEST_HIP AND NOT DEFINED EIGEN_HIP_ARCHITECTURES)
set(EIGEN_HIP_ARCHITECTURES
gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151
CACHE STRING "HIP GPU architectures to build Eigen's HIP tests for.")
endif()
#internal. See documentation of ei_add_test for details.
macro(ei_add_test_internal testname testname_with_suffix)
set(targetname ${testname_with_suffix})
@@ -36,7 +30,7 @@ macro(ei_add_test_internal testname testname_with_suffix)
hip_reset_flags()
hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS -std=c++14)
target_compile_definitions(${targetname} PRIVATE -DEIGEN_USE_HIP)
set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES "${EIGEN_HIP_ARCHITECTURES}")
set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
elseif(EIGEN_TEST_CUDA_CLANG)
set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
@@ -140,7 +134,6 @@ macro(ei_add_test_internal testname testname_with_suffix)
if (is_gpu_test)
# Add gpu tag for testing only GPU tests.
set_property(TEST ${testname_with_suffix} APPEND PROPERTY LABELS "gpu")
set_property(TEST ${testname_with_suffix} PROPERTY SKIP_RETURN_CODE 77)
endif()
if(EIGEN_SYCL)

View File

@@ -433,7 +433,7 @@ if(EIGEN_TEST_CUDA_NVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "NVHPC")
message(WARNING "EIGEN_TEST_CUDA_NVC is set, but CMAKE_CXX_COMPILER does not appear to be nvc++.")
endif()
find_package(CUDA 11.4)
find_package(CUDA 9.0)
if(CUDA_FOUND AND EIGEN_TEST_CUDA)
# Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
# and -fno-check-new flags since they trigger thousands of compilation warnings
@@ -479,78 +479,6 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
ei_add_test(gpu_example)
ei_add_test(gpu_basic)
ei_add_test(gpu_library_example "" "CUDA::cusolver")
# DeviceMatrix tests: only CUDA runtime, no NVIDIA libraries.
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
add_executable(gpu_device_matrix gpu_device_matrix.cpp)
target_include_directories(gpu_device_matrix PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart)
target_compile_definitions(gpu_device_matrix PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
add_test(NAME gpu_device_matrix COMMAND gpu_device_matrix)
add_dependencies(buildtests gpu_device_matrix)
add_dependencies(buildtests_gpu gpu_device_matrix)
set_property(TEST gpu_device_matrix APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST gpu_device_matrix PROPERTY SKIP_RETURN_CODE 77)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
# Library-specific GPU tests (activated by later phases, OFF by default).
# CUDAToolkit imported targets (CUDA::cublas, etc.) are available from
# find_package(CUDAToolkit) above.
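# Example configure line (assumed typical usage; adjust generator and paths as needed):
#   cmake -DEIGEN_TEST_CUDA=ON -DEIGEN_TEST_CUBLAS=ON -DEIGEN_TEST_CUSOLVER=ON <source-dir>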
option(EIGEN_TEST_CUBLAS "Test cuBLAS integration" OFF)
if(EIGEN_TEST_CUBLAS AND TARGET CUDA::cublas)
# cuBLAS tests are plain .cpp files (no device code), like cuSOLVER tests.
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
add_executable(gpu_cublas gpu_cublas.cpp)
target_include_directories(gpu_cublas PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_cublas
Eigen3::Eigen CUDA::cudart CUDA::cublas CUDA::cusolver)
target_compile_definitions(gpu_cublas PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
add_test(NAME gpu_cublas COMMAND gpu_cublas)
add_dependencies(buildtests gpu_cublas)
add_dependencies(buildtests_gpu gpu_cublas)
set_property(TEST gpu_cublas APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST gpu_cublas PROPERTY SKIP_RETURN_CODE 77)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif()
option(EIGEN_TEST_CUSOLVER "Test cuSOLVER integration" OFF)
if(EIGEN_TEST_CUSOLVER AND TARGET CUDA::cusolver)
# cuSOLVER tests are plain .cpp files: no device code, compiled by the host
# compiler and linked against CUDA runtime + cuSOLVER. This avoids NVCC
# instantiating Eigen's CPU packet operations for CUDA vector types.
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
foreach(_cusolver_test IN ITEMS gpu_cusolver_llt gpu_cusolver_lu)
add_executable(${_cusolver_test} ${_cusolver_test}.cpp)
target_include_directories(${_cusolver_test} PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(${_cusolver_test}
Eigen3::Eigen CUDA::cudart CUDA::cusolver CUDA::cublas)
target_compile_definitions(${_cusolver_test} PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
add_test(NAME ${_cusolver_test} COMMAND "${_cusolver_test}")
add_dependencies(buildtests ${_cusolver_test})
add_dependencies(buildtests_gpu ${_cusolver_test})
set_property(TEST ${_cusolver_test} APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST ${_cusolver_test} PROPERTY SKIP_RETURN_CODE 77)
endforeach()
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif()
option(EIGEN_TEST_CUSPARSE "Test cuSPARSE integration" OFF)
if(EIGEN_TEST_CUSPARSE AND TARGET CUDA::cusparse)
ei_add_test(gpu_cusparse "" "CUDA::cusparse")
endif()
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
@@ -574,9 +502,6 @@ if (EIGEN_TEST_HIP)
endif()
find_package(HIP REQUIRED)
if (HIP_FOUND AND HIP_VERSION VERSION_LESS "5.6")
message(FATAL_ERROR "Eigen requires ROCm/HIP >= 5.6, found ${HIP_VERSION}")
endif()
if (HIP_FOUND)
execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)

View File

@@ -7,6 +7,12 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// workaround issue between gcc >= 4.7 and cuda 5.5
#if (defined __GNUC__) && (__GNUC__ > 4 || __GNUC_MINOR__ >= 7)
#undef _GLIBCXX_ATOMIC_BUILTINS
#undef _GLIBCXX_USE_INT128
#endif
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int

View File

@@ -1,72 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TEST_GPU_CONTEXT_H
#define EIGEN_TEST_GPU_CONTEXT_H
// RAII context for GPU tests that use NVIDIA library APIs (cuBLAS, cuSOLVER, etc.).
// Owns a non-default CUDA stream. Library handles (cuBLAS, cuSOLVER, etc.) are added
// here by each integration phase as needed; each handle is bound to the owned stream.
//
// Usage:
// GpuContext ctx;
// auto buf = gpu_copy_to_device(ctx.stream, A);
// // ... call NVIDIA library APIs using ctx.stream / ctx.cusolver ...
// ctx.synchronize();
#include "gpu_test_helper.h"
#ifdef EIGEN_USE_GPU
#include <cusolverDn.h>
// Checks cuSOLVER return codes, aborts on failure.
#define CUSOLVER_CHECK(expr) \
do { \
cusolverStatus_t _status = (expr); \
if (_status != CUSOLVER_STATUS_SUCCESS) { \
printf("cuSOLVER error %d at %s:%d\n", static_cast<int>(_status), __FILE__, __LINE__); \
gpu_assert(false); \
} \
} while (0)
struct GpuContext {
cudaStream_t stream = nullptr;
cusolverDnHandle_t cusolver = nullptr;
GpuContext() {
GPU_CHECK(gpuGetDevice(&device_));
GPU_CHECK(gpuGetDeviceProperties(&device_props_, device_));
GPU_CHECK(cudaStreamCreate(&stream));
CUSOLVER_CHECK(cusolverDnCreate(&cusolver));
CUSOLVER_CHECK(cusolverDnSetStream(cusolver, stream));
}
~GpuContext() {
if (cusolver) CUSOLVER_CHECK(cusolverDnDestroy(cusolver));
if (stream) GPU_CHECK(cudaStreamDestroy(stream));
}
int device() const { return device_; }
const gpuDeviceProp_t& deviceProperties() const { return device_props_; }
// Wait for all work submitted on this context's stream to complete.
void synchronize() { GPU_CHECK(cudaStreamSynchronize(stream)); }
// Non-copyable, non-movable.
GpuContext(const GpuContext&) = delete;
GpuContext& operator=(const GpuContext&) = delete;
private:
int device_ = 0;
gpuDeviceProp_t device_props_;
};
#endif // EIGEN_USE_GPU
#endif // EIGEN_TEST_GPU_CONTEXT_H

View File

@@ -1,728 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for cuBLAS GEMM dispatch via DeviceMatrix expression syntax.
// Covers: d_C = d_A * d_B, adjoint, transpose, scaled, +=, .device(ctx).
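// Representative expression forms exercised here (illustrative sketch; see the tests below):
//   DeviceMatrix<Scalar> d_C;
//   d_C = d_A.adjoint() * d_B;            // GEMM with an op on the LHS
//   d_C = alpha * d_A * d_B;              // scaled GEMM
//   d_C += d_A * d_B;                     // accumulate (beta = 1)
//   d_C.device(ctx) = d_A * d_B;          // run on an explicit GpuContext stream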
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/GPU>
using namespace Eigen;
// ---- Basic GEMM: C = A * B -------------------------------------------------
template <typename Scalar>
void test_gemm_basic(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
// Expression: d_C = d_A * d_B
DeviceMatrix<Scalar> d_C;
d_C = d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM with adjoint: C = A^H * B ----------------------------------------
template <typename Scalar>
void test_gemm_adjoint_lhs(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(k, m); // A is k×m, A^H is m×k
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = d_A.adjoint() * d_B;
Mat C = d_C.toHost();
Mat C_ref = A.adjoint() * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM with transpose: C = A * B^T --------------------------------------
template <typename Scalar>
void test_gemm_transpose_rhs(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(n, k); // B is n×k, B^T is k×n
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = d_A * d_B.transpose();
Mat C = d_C.toHost();
Mat C_ref = A * B.transpose();
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM with scaled: C = alpha * A * B ------------------------------------
template <typename Scalar>
void test_gemm_scaled(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
Scalar alpha = Scalar(2.5);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = alpha * d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = alpha * A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM accumulate: C += A * B (beta=1) -----------------------------------
template <typename Scalar>
void test_gemm_accumulate(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
Mat C_init = Mat::Random(m, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);
d_C += d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = C_init + A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM accumulate into empty destination ---------------------------------
template <typename Scalar>
void test_gemm_accumulate_empty(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C += d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM subtract: C -= A * B (beta=1, alpha=-1) --------------------------
template <typename Scalar>
void test_gemm_subtract(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
Mat C_init = Mat::Random(m, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);
GpuContext ctx;
d_C.device(ctx) -= d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = C_init - A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM subtract from empty destination -----------------------------------
template <typename Scalar>
void test_gemm_subtract_empty(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuContext ctx;
DeviceMatrix<Scalar> d_C;
d_C.device(ctx) -= d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = -(A * B);
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM with scaled RHS: C = A * (alpha * B) -----------------------------
template <typename Scalar>
void test_gemm_scaled_rhs(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
Scalar alpha = Scalar(3.0);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = d_A * (alpha * d_B);
Mat C = d_C.toHost();
Mat C_ref = A * (alpha * B);
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM dimension mismatch must assert ------------------------------------
template <typename Scalar>
void test_gemm_dimension_mismatch() {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
Mat A = Mat::Random(8, 5);
Mat B = Mat::Random(6, 7); // inner dimension mismatch
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
VERIFY_RAISES_ASSERT(d_C = d_A * d_B);
}
// ---- GEMM with explicit GpuContext ------------------------------------------
template <typename Scalar>
void test_gemm_explicit_context(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuContext ctx;
DeviceMatrix<Scalar> d_C;
d_C.device(ctx) = d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM cross-context reuse of the same destination -----------------------
template <typename Scalar>
void test_gemm_cross_context_reuse(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, n);
Mat D = Mat::Random(n, n);
Mat E = Mat::Random(n, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
GpuContext ctx1;
GpuContext ctx2;
DeviceMatrix<Scalar> d_C;
d_C.device(ctx1) = d_A * d_B;
d_C.device(ctx2) += d_D * d_E;
Mat C = d_C.toHost();
Mat C_ref = A * B + D * E;
RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM cross-context resize of the destination ---------------------------
template <typename Scalar>
void test_gemm_cross_context_resize() {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(64, 64);
Mat B = Mat::Random(64, 64);
Mat D = Mat::Random(32, 16);
Mat E = Mat::Random(16, 8);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
GpuContext ctx1;
GpuContext ctx2;
DeviceMatrix<Scalar> d_C;
d_C.device(ctx1) = d_A * d_B;
d_C.device(ctx2) = d_D * d_E;
Mat C = d_C.toHost();
Mat C_ref = D * E;
RealScalar tol = RealScalar(16) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM chaining: C = (A * B) then D = C * E -----------------------------
template <typename Scalar>
void test_gemm_chain(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, n);
Mat E = Mat::Random(n, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
DeviceMatrix<Scalar> d_C;
d_C = d_A * d_B;
DeviceMatrix<Scalar> d_D;
d_D = d_C * d_E;
Mat D = d_D.toHost();
Mat D_ref = (A * B) * E;
RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * D_ref.norm();
VERIFY((D - D_ref).norm() < tol);
}
// ---- Square identity check: A * I = A ---------------------------------------
template <typename Scalar>
void test_gemm_identity(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
Mat A = Mat::Random(n, n);
Mat eye = Mat::Identity(n, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_I = DeviceMatrix<Scalar>::fromHost(eye);
DeviceMatrix<Scalar> d_C;
d_C = d_A * d_I;
Mat C = d_C.toHost();
VERIFY_IS_APPROX(C, A);
}
// ---- LLT solve expression: d_X = d_A.llt().solve(d_B) ----------------------
template <typename MatrixType>
MatrixType make_spd(Index n) {
using Scalar = typename MatrixType::Scalar;
MatrixType M = MatrixType::Random(n, n);
return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
}
template <typename Scalar>
void test_llt_solve_expr(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = make_spd<Mat>(n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.llt().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- LLT solve with explicit context ----------------------------------------
template <typename Scalar>
void test_llt_solve_expr_context(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = make_spd<Mat>(n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuContext ctx;
DeviceMatrix<Scalar> d_X;
d_X.device(ctx) = d_A.llt().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------
template <typename Scalar>
void test_lu_solve_expr(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.lu().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------
template <typename Scalar>
void test_gemm_then_solve(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat D = Mat::Random(n, 1);
// Make SPD: C = A^H * A + n*I
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
DeviceMatrix<Scalar> d_C;
d_C = d_A.adjoint() * d_A;
// Add n*I on host (no element-wise ops on DeviceMatrix yet).
Mat C = d_C.toHost();
C += Mat::Identity(n, n) * static_cast<Scalar>(n);
d_C = DeviceMatrix<Scalar>::fromHost(C);
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
DeviceMatrix<Scalar> d_X;
d_X = d_C.llt().solve(d_D);
Mat X = d_X.toHost();
RealScalar residual = (C * X - D).norm() / D.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- LLT solve with Upper triangle -----------------------------------------
template <typename Scalar>
void test_llt_solve_upper(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = make_spd<Mat>(n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.template llt<Upper>().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- LU solve with explicit context -----------------------------------------
template <typename Scalar>
void test_lu_solve_expr_context(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuContext ctx;
DeviceMatrix<Scalar> d_X;
d_X.device(ctx) = d_A.lu().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- Zero-nrhs solver expressions ------------------------------------------
template <typename Scalar>
void test_llt_solve_zero_nrhs(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
Mat A = make_spd<Mat>(n);
Mat B = Mat::Random(n, 0);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.llt().solve(d_B);
VERIFY_IS_EQUAL(d_X.rows(), n);
VERIFY_IS_EQUAL(d_X.cols(), 0);
}
template <typename Scalar>
void test_lu_solve_zero_nrhs(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, 0);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.lu().solve(d_B);
VERIFY_IS_EQUAL(d_X.rows(), n);
VERIFY_IS_EQUAL(d_X.cols(), 0);
}
// ---- TRSM: triangularView<UpLo>().solve(B) ----------------------------------
template <typename Scalar, int UpLo>
void test_trsm(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
// Build a well-conditioned triangular matrix.
Mat A = Mat::Random(n, n);
A.diagonal().array() += static_cast<Scalar>(n); // ensure non-singular
if (UpLo == Lower)
A = A.template triangularView<Lower>();
else
A = A.template triangularView<Upper>();
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.template triangularView<UpLo>().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- SYMM/HEMM: selfadjointView<UpLo>() * B --------------------------------
template <typename Scalar, int UpLo>
void test_symm(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = make_spd<Mat>(n); // SPD is also self-adjoint
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = d_A.template selfadjointView<UpLo>() * d_B;
Mat C = d_C.toHost();
Mat C_ref = A * B; // A is symmetric, so full multiply == symm
RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- SYRK/HERK: rankUpdate(A) → C = A * A^H --------------------------------
template <typename Scalar>
void test_syrk(Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, k);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
DeviceMatrix<Scalar> d_C;
d_C.template selfadjointView<Lower>().rankUpdate(d_A);
Mat C = d_C.toHost();
// Only lower triangle is meaningful for SYRK. Compare lower triangle.
Mat C_ref = A * A.adjoint();
// Extract lower triangle for comparison.
Mat C_lower = C.template triangularView<Lower>();
Mat C_ref_lower = C_ref.template triangularView<Lower>();
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C_lower - C_ref_lower).norm() < tol);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_gemm_basic<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_basic<Scalar>(128, 64, 32));
CALL_SUBTEST(test_gemm_basic<Scalar>(1, 1, 1));
CALL_SUBTEST(test_gemm_basic<Scalar>(256, 256, 256));
CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(128, 32, 64));
CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(128, 32, 64));
CALL_SUBTEST(test_gemm_scaled<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_scaled_rhs<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_accumulate<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_accumulate_empty<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_subtract<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_subtract_empty<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_dimension_mismatch<Scalar>());
CALL_SUBTEST(test_gemm_explicit_context<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_cross_context_reuse<Scalar>(64));
CALL_SUBTEST(test_gemm_cross_context_resize<Scalar>());
CALL_SUBTEST(test_gemm_chain<Scalar>(64));
CALL_SUBTEST(test_gemm_identity<Scalar>(64));
// Solver expressions (zero-size edge cases are covered by dedicated tests below, not residual checks)
CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 1));
CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 4));
CALL_SUBTEST(test_llt_solve_expr<Scalar>(256, 8));
CALL_SUBTEST(test_llt_solve_expr_context<Scalar>(64, 4));
CALL_SUBTEST(test_llt_solve_upper<Scalar>(64, 4));
CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 1));
CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 4));
CALL_SUBTEST(test_lu_solve_expr<Scalar>(256, 8));
CALL_SUBTEST(test_lu_solve_expr_context<Scalar>(64, 4));
CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(64));
CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(0));
CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(64));
CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(0));
CALL_SUBTEST(test_gemm_then_solve<Scalar>(64));
// TRSM
CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 1)));
CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 4)));
CALL_SUBTEST((test_trsm<Scalar, Upper>(64, 4)));
CALL_SUBTEST((test_trsm<Scalar, Lower>(256, 8)));
// SYMM/HEMM
CALL_SUBTEST((test_symm<Scalar, Lower>(64, 4)));
CALL_SUBTEST((test_symm<Scalar, Upper>(64, 4)));
CALL_SUBTEST((test_symm<Scalar, Lower>(128, 8)));
// SYRK/HERK
CALL_SUBTEST(test_syrk<Scalar>(64, 64));
CALL_SUBTEST(test_syrk<Scalar>(64, 32));
CALL_SUBTEST(test_syrk<Scalar>(128, 64));
}
// ---- Solver failure mode tests (not templated on Scalar) --------------------
void test_llt_not_spd() {
// Negative definite matrix — LLT factorization must fail.
MatrixXd A = -MatrixXd::Identity(8, 8);
MatrixXd B = MatrixXd::Random(8, 1);
auto d_A = DeviceMatrix<double>::fromHost(A);
auto d_B = DeviceMatrix<double>::fromHost(B);
DeviceMatrix<double> d_X;
VERIFY_RAISES_ASSERT(d_X = d_A.llt().solve(d_B));
}
void test_lu_singular() {
// Zero matrix — LU factorization must detect singularity.
MatrixXd A = MatrixXd::Zero(8, 8);
MatrixXd B = MatrixXd::Random(8, 1);
auto d_A = DeviceMatrix<double>::fromHost(A);
auto d_B = DeviceMatrix<double>::fromHost(B);
DeviceMatrix<double> d_X;
VERIFY_RAISES_ASSERT(d_X = d_A.lu().solve(d_B));
}
EIGEN_DECLARE_TEST(gpu_cublas) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_llt_not_spd());
CALL_SUBTEST(test_lu_singular());
}

View File

@@ -1,210 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Eigen Authors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuLLT: GPU Cholesky (LL^T) using cuSOLVER.
// Covers cusolverDnXpotrf (factorization) and cusolverDnXpotrs (solve)
// for float, double, complex<float>, complex<double>, Lower and Upper.
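// Typical call sequence under test (sketch; mirrors the cases below):
//   GpuLLT<Scalar, Lower> llt(A);         // or llt.compute(d_A) for a DeviceMatrix input
//   VERIFY_IS_EQUAL(llt.info(), Success);
//   MatrixType X = llt.solve(B);          // the device-resident factor is reused per solve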
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Cholesky>
#include <Eigen/GPU>
using namespace Eigen;
// Build a random symmetric positive-definite matrix: A = M^H*M + n*I.
template <typename MatrixType>
MatrixType make_spd(Index n) {
using Scalar = typename MatrixType::Scalar;
MatrixType M = MatrixType::Random(n, n);
return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
}
// Test factorization: L*L^H must reconstruct A to within floating-point tolerance.
template <typename Scalar, int UpLo>
void test_potrf(Index n) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
MatrixType A = make_spd<MatrixType>(n);
GpuLLT<Scalar, UpLo> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
// Reconstruct L*L^H and compare to original A.
// GpuLLT stores the factor on device; use CPU LLT to get the triangular factor
// for reconstruction since GpuLLT does not expose the device-resident factor directly.
LLT<MatrixType, UpLo> ref(A);
VERIFY_IS_EQUAL(ref.info(), Success);
MatrixType A_reconstructed = ref.reconstructedMatrix();
// The reconstruction should match A to within ~4*n*eps*||A||.
RealScalar tol = RealScalar(4) * RealScalar(n) * NumTraits<Scalar>::epsilon() * A.norm();
VERIFY((A_reconstructed - A).norm() < tol);
// Smoke-test: llt.solve(b) should return the same result as ref.solve(b).
MatrixType b = MatrixType::Random(n, 1);
MatrixType x_gpu = llt.solve(b);
MatrixType x_cpu = ref.solve(b);
VERIFY((x_gpu - x_cpu).norm() < tol);
}
// Test solve: residual ||A*X - B|| / ||B|| must be small.
template <typename Scalar, int UpLo>
void test_potrs(Index n, Index nrhs) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
MatrixType A = make_spd<MatrixType>(n);
MatrixType B = MatrixType::Random(n, nrhs);
GpuLLT<Scalar, UpLo> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
MatrixType X = llt.solve(B);
RealScalar residual = (A * X - B).norm() / B.norm();
RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(residual < tol);
}
// Test that multiple solves against the same factor all produce correct results.
// This exercises the key design property: L stays on device across calls.
template <typename Scalar>
void test_multiple_solves(Index n) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
MatrixType A = make_spd<MatrixType>(n);
GpuLLT<Scalar, Lower> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon();
for (int k = 0; k < 5; ++k) {
MatrixType B = MatrixType::Random(n, 3);
MatrixType X = llt.solve(B);
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < tol);
}
}
// Test that GpuLLT correctly detects a non-SPD matrix.
void test_not_spd() {
MatrixXd A = -MatrixXd::Identity(8, 8); // negative definite
GpuLLT<double> llt(A);
VERIFY_IS_EQUAL(llt.info(), NumericalIssue);
}
// ---- DeviceMatrix integration tests -----------------------------------------
// compute(DeviceMatrix) + solve(DeviceMatrix) → toHost
template <typename Scalar, int UpLo>
void test_device_matrix_solve(Index n, Index nrhs) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
MatrixType A = make_spd<MatrixType>(n);
MatrixType B = MatrixType::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuLLT<Scalar, UpLo> llt;
llt.compute(d_A);
VERIFY_IS_EQUAL(llt.info(), Success);
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
MatrixType X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// compute(DeviceMatrix&&) — move path
template <typename Scalar>
void test_device_matrix_move_compute(Index n) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
MatrixType A = make_spd<MatrixType>(n);
MatrixType B = MatrixType::Random(n, 1);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
GpuLLT<Scalar, Lower> llt;
llt.compute(std::move(d_A));
VERIFY_IS_EQUAL(llt.info(), Success);
// d_A should be empty after move.
VERIFY(d_A.empty());
MatrixType X = llt.solve(B);
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// Full async chain: compute → solve → solve again with result as RHS → toHost
template <typename Scalar>
void test_chaining(Index n) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
MatrixType A = make_spd<MatrixType>(n);
MatrixType B = MatrixType::Random(n, 3);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuLLT<Scalar, Lower> llt;
llt.compute(d_A);
VERIFY_IS_EQUAL(llt.info(), Success);
// Chain: solve → use result as RHS for another solve
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
DeviceMatrix<Scalar> d_Y = llt.solve(d_X);
// Only sync at the very end.
MatrixType Y = d_Y.toHost();
// Verify: Y = A^{-2} * B
MatrixType X_ref = LLT<MatrixType, Lower>(A).solve(B);
MatrixType Y_ref = LLT<MatrixType, Lower>(A).solve(X_ref);
RealScalar tol = RealScalar(4) * RealScalar(n) * NumTraits<Scalar>::epsilon() * Y_ref.norm();
VERIFY((Y - Y_ref).norm() < tol);
}
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST((test_potrf<Scalar, Lower>(1)));
CALL_SUBTEST((test_potrf<Scalar, Lower>(64)));
CALL_SUBTEST((test_potrf<Scalar, Lower>(256)));
CALL_SUBTEST((test_potrf<Scalar, Upper>(64)));
CALL_SUBTEST((test_potrf<Scalar, Upper>(256)));
CALL_SUBTEST((test_potrs<Scalar, Lower>(64, 1)));
CALL_SUBTEST((test_potrs<Scalar, Lower>(64, 4)));
CALL_SUBTEST((test_potrs<Scalar, Lower>(256, 8)));
CALL_SUBTEST((test_potrs<Scalar, Upper>(64, 1)));
CALL_SUBTEST((test_potrs<Scalar, Upper>(256, 4)));
CALL_SUBTEST(test_multiple_solves<Scalar>(128));
CALL_SUBTEST((test_device_matrix_solve<Scalar, Lower>(64, 4)));
CALL_SUBTEST((test_device_matrix_solve<Scalar, Upper>(128, 1)));
CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));
CALL_SUBTEST(test_chaining<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cusolver_llt) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_not_spd());
}

View File

@@ -1,206 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Eigen Authors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuLU: GPU partial-pivoting LU decomposition via cuSOLVER.
// Covers cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve)
// for float, double, complex<float>, complex<double>.
//
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/LU>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Test factorization + NoTrans solve: residual ||A*X - B|| / (||A||*||X||) ----
template <typename Scalar>
void test_getrf(Index n) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
MatrixType A = MatrixType::Random(n, n);
MatrixType B = MatrixType::Random(n, 4);
GpuLU<Scalar> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
MatrixType X = lu.solve(B);
// Backward error bound for LU: ||A*X - B|| <= O(n*u) * ||A|| * ||X||.
// Normalize by ||A||*||X|| rather than ||B|| to be condition-number agnostic.
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}
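// For reference, a sketch of the bound behind the tolerance above (standard
// backward-error reasoning, not specific to this implementation): the normwise
// backward error of a computed solution X is
//   eta(X) = ||A*X - B|| / (||A|| * ||X||),
// and a numerically stable LU solve gives eta(X) <= c * n * u for a modest
// constant c, with u = machine epsilon. That is why the 10 * n * epsilon
// threshold holds regardless of cond(A).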
// ---- Test solve: A^T*X = B and A^H*X = B ------------------------------------
template <typename Scalar>
void test_getrs_trans(Index n) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
MatrixType A = MatrixType::Random(n, n);
MatrixType B = MatrixType::Random(n, 3);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
GpuLU<Scalar> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
MatrixType Xt = lu.solve(B, GpuLU<Scalar>::Transpose);
VERIFY((A.transpose() * Xt - B).norm() / (A.norm() * Xt.norm()) < tol);
MatrixType Xc = lu.solve(B, GpuLU<Scalar>::ConjugateTranspose);
VERIFY((A.adjoint() * Xc - B).norm() / (A.norm() * Xc.norm()) < tol);
}
// ---- Test multiple solves reuse the device-resident LU ----------------------
template <typename Scalar>
void test_multiple_solves(Index n) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
MatrixType A = MatrixType::Random(n, n);
GpuLU<Scalar> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
for (int k = 0; k < 5; ++k) {
MatrixType B = MatrixType::Random(n, 3);
MatrixType X = lu.solve(B);
VERIFY((A * X - B).norm() / (A.norm() * X.norm()) < tol);
}
}
// ---- Agreement with CPU PartialPivLU ----------------------------------------
template <typename Scalar>
void test_vs_cpu(Index n) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
MatrixType A = MatrixType::Random(n, n);
MatrixType B = MatrixType::Random(n, 5);
GpuLU<Scalar> gpu_lu(A);
VERIFY_IS_EQUAL(gpu_lu.info(), Success);
MatrixType X_gpu = gpu_lu.solve(B);
MatrixType X_cpu = PartialPivLU<MatrixType>(A).solve(B);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);
}
// ---- Singular matrix detection ----------------------------------------------
void test_singular() {
MatrixXd A = MatrixXd::Zero(8, 8);
GpuLU<double> lu(A);
VERIFY_IS_EQUAL(lu.info(), NumericalIssue);
}
// ---- DeviceMatrix integration tests -----------------------------------------
template <typename Scalar>
void test_device_matrix_solve(Index n) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
MatrixType A = MatrixType::Random(n, n);
MatrixType B = MatrixType::Random(n, 4);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuLU<Scalar> lu;
lu.compute(d_A);
VERIFY_IS_EQUAL(lu.info(), Success);
DeviceMatrix<Scalar> d_X = lu.solve(d_B);
MatrixType X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}
template <typename Scalar>
void test_device_matrix_move_compute(Index n) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
MatrixType A = MatrixType::Random(n, n);
MatrixType B = MatrixType::Random(n, 1);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
GpuLU<Scalar> lu;
lu.compute(std::move(d_A));
VERIFY_IS_EQUAL(lu.info(), Success);
VERIFY(d_A.empty());
MatrixType X = lu.solve(B);
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}
template <typename Scalar>
void test_chaining(Index n) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
MatrixType A = MatrixType::Random(n, n);
MatrixType B = MatrixType::Random(n, 3);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuLU<Scalar> lu;
lu.compute(d_A);
VERIFY_IS_EQUAL(lu.info(), Success);
// Chain: solve → use result as RHS
DeviceMatrix<Scalar> d_X = lu.solve(d_B);
DeviceMatrix<Scalar> d_Y = lu.solve(d_X);
MatrixType Y = d_Y.toHost();
MatrixType X_ref = PartialPivLU<MatrixType>(A).solve(B);
MatrixType Y_ref = PartialPivLU<MatrixType>(A).solve(X_ref);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon() * Y_ref.norm();
VERIFY((Y - Y_ref).norm() < tol);
}
// ---- Per-scalar driver -------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_getrf<Scalar>(1));
CALL_SUBTEST(test_getrf<Scalar>(64));
CALL_SUBTEST(test_getrf<Scalar>(256));
CALL_SUBTEST(test_getrs_trans<Scalar>(64));
CALL_SUBTEST(test_getrs_trans<Scalar>(128));
CALL_SUBTEST(test_multiple_solves<Scalar>(128));
CALL_SUBTEST(test_vs_cpu<Scalar>(64));
CALL_SUBTEST(test_vs_cpu<Scalar>(256));
CALL_SUBTEST(test_device_matrix_solve<Scalar>(64));
CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));
CALL_SUBTEST(test_chaining<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cusolver_lu) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_singular());
}

View File

@@ -1,247 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for DeviceMatrix and HostTransfer: typed RAII GPU memory wrapper.
// No cuSOLVER dependency — only CUDA runtime.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/GPU>
using namespace Eigen;
// ---- Default construction ---------------------------------------------------
void test_default_construct() {
DeviceMatrix<double> dm;
VERIFY(dm.empty());
VERIFY_IS_EQUAL(dm.rows(), 0);
VERIFY_IS_EQUAL(dm.cols(), 0);
VERIFY(dm.data() == nullptr);
VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(0));
}
// ---- Allocate uninitialized -------------------------------------------------
template <typename Scalar>
void test_allocate(Index rows, Index cols) {
DeviceMatrix<Scalar> dm(rows, cols);
VERIFY(!dm.empty());
VERIFY_IS_EQUAL(dm.rows(), rows);
VERIFY_IS_EQUAL(dm.cols(), cols);
VERIFY_IS_EQUAL(dm.outerStride(), rows);
VERIFY(dm.data() != nullptr);
VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar));
}
// ---- fromHost / toHost roundtrip (synchronous) ------------------------------
template <typename Scalar>
void test_roundtrip(Index rows, Index cols) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
MatrixType host = MatrixType::Random(rows, cols);
auto dm = DeviceMatrix<Scalar>::fromHost(host);
VERIFY_IS_EQUAL(dm.rows(), rows);
VERIFY_IS_EQUAL(dm.cols(), cols);
VERIFY(!dm.empty());
MatrixType result = dm.toHost();
VERIFY_IS_EQUAL(result.rows(), rows);
VERIFY_IS_EQUAL(result.cols(), cols);
VERIFY_IS_APPROX(result, host);
}
// ---- fromHostAsync / toHostAsync roundtrip -----------------------------------
template <typename Scalar>
void test_roundtrip_async(Index rows, Index cols) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
MatrixType host = MatrixType::Random(rows, cols);
cudaStream_t stream;
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));
// Async upload from raw pointer.
auto dm = DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, rows, stream);
VERIFY_IS_EQUAL(dm.rows(), rows);
VERIFY_IS_EQUAL(dm.cols(), cols);
// Async download via HostTransfer future.
auto transfer = dm.toHostAsync(stream);
// get() blocks and returns the matrix.
MatrixType result = transfer.get();
VERIFY_IS_APPROX(result, host);
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(stream));
}
// ---- HostTransfer::ready() and idempotent get() -----------------------------
void test_host_transfer_ready() {
using MatrixType = Matrix<double, Dynamic, Dynamic>;
MatrixType host = MatrixType::Random(100, 100);
auto dm = DeviceMatrix<double>::fromHost(host);
auto transfer = dm.toHostAsync();
// After get(), ready() must return true.
MatrixType result = transfer.get();
VERIFY(transfer.ready());
VERIFY_IS_APPROX(result, host);
// get() is idempotent.
MatrixType& result2 = transfer.get();
VERIFY_IS_APPROX(result2, host);
}
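// A hypothetical usage sketch (not part of the test, and assuming ready() also
// reports completion before get() is called): ready() lets a caller overlap the
// device-to-host copy with other host work before blocking in get(). Only
// toHostAsync(), ready() and get() from the API exercised above are used;
// do_other_host_work() is a placeholder.
//
//   auto transfer = dm.toHostAsync();
//   while (!transfer.ready()) do_other_host_work();
//   MatrixType result = transfer.get();  // returns immediately once the copy is done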
// ---- HostTransfer move ------------------------------------------------------
void test_host_transfer_move() {
using MatrixType = Matrix<double, Dynamic, Dynamic>;
MatrixType host = MatrixType::Random(50, 50);
auto dm = DeviceMatrix<double>::fromHost(host);
auto transfer = dm.toHostAsync();
HostTransfer<double> moved(std::move(transfer));
MatrixType result = moved.get();
VERIFY_IS_APPROX(result, host);
}
// ---- clone() produces independent copy --------------------------------------
template <typename Scalar>
void test_clone(Index rows, Index cols) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
MatrixType host = MatrixType::Random(rows, cols);
auto dm = DeviceMatrix<Scalar>::fromHost(host);
auto cloned = dm.clone();
// Overwrite original with different data.
MatrixType other = MatrixType::Random(rows, cols);
dm = DeviceMatrix<Scalar>::fromHost(other);
// Clone still holds the original data.
MatrixType clone_result = cloned.toHost();
VERIFY_IS_APPROX(clone_result, host);
// Original holds the new data.
MatrixType dm_result = dm.toHost();
VERIFY_IS_APPROX(dm_result, other);
}
// ---- Move construct ---------------------------------------------------------
template <typename Scalar>
void test_move_construct(Index rows, Index cols) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
MatrixType host = MatrixType::Random(rows, cols);
auto dm = DeviceMatrix<Scalar>::fromHost(host);
DeviceMatrix<Scalar> moved(std::move(dm));
VERIFY(dm.empty());
VERIFY(dm.data() == nullptr);
VERIFY_IS_EQUAL(moved.rows(), rows);
VERIFY_IS_EQUAL(moved.cols(), cols);
MatrixType result = moved.toHost();
VERIFY_IS_APPROX(result, host);
}
// ---- Move assign ------------------------------------------------------------
template <typename Scalar>
void test_move_assign(Index rows, Index cols) {
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
MatrixType host = MatrixType::Random(rows, cols);
auto dm = DeviceMatrix<Scalar>::fromHost(host);
DeviceMatrix<Scalar> dest;
dest = std::move(dm);
VERIFY(dm.empty());
VERIFY_IS_EQUAL(dest.rows(), rows);
MatrixType result = dest.toHost();
VERIFY_IS_APPROX(result, host);
}
// ---- resize() ---------------------------------------------------------------
void test_resize() {
DeviceMatrix<double> dm(10, 20);
VERIFY_IS_EQUAL(dm.rows(), 10);
VERIFY_IS_EQUAL(dm.cols(), 20);
dm.resize(50, 30);
VERIFY_IS_EQUAL(dm.rows(), 50);
VERIFY_IS_EQUAL(dm.cols(), 30);
VERIFY_IS_EQUAL(dm.outerStride(), 50);
VERIFY(dm.data() != nullptr);
// Resize to same dimensions is a no-op.
double* ptr_before = dm.data();
dm.resize(50, 30);
VERIFY(dm.data() == ptr_before);
}
// ---- Empty / 0x0 matrix -----------------------------------------------------
void test_empty() {
using MatrixType = Matrix<double, Dynamic, Dynamic>;
MatrixType empty_mat(0, 0);
auto dm = DeviceMatrix<double>::fromHost(empty_mat);
VERIFY(dm.empty());
VERIFY_IS_EQUAL(dm.rows(), 0);
VERIFY_IS_EQUAL(dm.cols(), 0);
MatrixType result = dm.toHost();
VERIFY_IS_EQUAL(result.rows(), 0);
VERIFY_IS_EQUAL(result.cols(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
// Square.
CALL_SUBTEST(test_roundtrip<Scalar>(1, 1));
CALL_SUBTEST(test_roundtrip<Scalar>(64, 64));
CALL_SUBTEST(test_roundtrip<Scalar>(256, 256));
// Rectangular.
CALL_SUBTEST(test_roundtrip<Scalar>(100, 7));
CALL_SUBTEST(test_roundtrip<Scalar>(7, 100));
// Async roundtrip.
CALL_SUBTEST(test_roundtrip_async<Scalar>(64, 64));
CALL_SUBTEST(test_roundtrip_async<Scalar>(100, 7));
CALL_SUBTEST(test_clone<Scalar>(64, 64));
CALL_SUBTEST(test_move_construct<Scalar>(64, 64));
CALL_SUBTEST(test_move_assign<Scalar>(64, 64));
}
EIGEN_DECLARE_TEST(gpu_device_matrix) {
CALL_SUBTEST(test_default_construct());
CALL_SUBTEST(test_empty());
CALL_SUBTEST(test_resize());
CALL_SUBTEST(test_host_transfer_ready());
CALL_SUBTEST(test_host_transfer_move());
CALL_SUBTEST((test_allocate<float>(100, 50)));
CALL_SUBTEST((test_allocate<double>(100, 50)));
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
}

View File

@@ -1,110 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Smoke test for GPU library test infrastructure.
// Verifies GpuContext, GpuBuffer, and host<->device matrix transfers
// without requiring any NVIDIA library (cuBLAS, cuSOLVER, etc.).
#define EIGEN_USE_GPU
#include "main.h"
#include "gpu_context.h"
#include "gpu_library_test_helper.h"
using namespace Eigen;
using namespace Eigen::test;
// Test that GpuContext initializes, reports valid device info, and owns a cuSOLVER handle.
void test_gpu_context() {
GpuContext ctx;
VERIFY(ctx.device() >= 0);
VERIFY(ctx.deviceProperties().major >= 7); // sm_70 minimum
VERIFY(ctx.stream != nullptr);
VERIFY(ctx.cusolver != nullptr);
std::cout << " GPU: " << ctx.deviceProperties().name << " (sm_" << ctx.deviceProperties().major
<< ctx.deviceProperties().minor << ")\n";
}
// Test dense matrix roundtrip: host -> device -> host.
template <typename MatrixType>
void test_dense_roundtrip() {
GpuContext ctx;
const Index rows = 64;
const Index cols = 32;
MatrixType A = MatrixType::Random(rows, cols);
auto buf = gpu_copy_to_device(ctx.stream, A);
VERIFY(buf.data != nullptr);
VERIFY(buf.size == rows * cols);
MatrixType B(rows, cols);
B.setZero();
gpu_copy_to_host(ctx.stream, buf, B);
ctx.synchronize();
VERIFY_IS_EQUAL(A, B);
}
// Test GpuBuffer RAII: move semantics, async zero-init.
void test_gpu_buffer() {
GpuContext ctx;
GpuBuffer<float> a(128);
VERIFY(a.data != nullptr);
VERIFY(a.size == 128);
// Move construction.
GpuBuffer<float> b(std::move(a));
VERIFY(a.data == nullptr);
VERIFY(b.data != nullptr);
VERIFY(b.size == 128);
// Move assignment.
GpuBuffer<float> c;
c = std::move(b);
VERIFY(b.data == nullptr);
VERIFY(c.data != nullptr);
// setZeroAsync.
c.setZeroAsync(ctx.stream);
ctx.synchronize();
std::vector<float> host(128);
GPU_CHECK(cudaMemcpy(host.data(), c.data, 128 * sizeof(float), cudaMemcpyDeviceToHost));
for (int i = 0; i < 128; ++i) {
VERIFY_IS_EQUAL(host[i], 0.0f);
}
}
// Test with vectors (1D).
template <typename Scalar>
void test_vector_roundtrip() {
GpuContext ctx;
const Index n = 256;
Matrix<Scalar, Dynamic, 1> v = Matrix<Scalar, Dynamic, 1>::Random(n);
auto buf = gpu_copy_to_device(ctx.stream, v);
Matrix<Scalar, Dynamic, 1> w(n);
w.setZero();
gpu_copy_to_host(ctx.stream, buf, w);
ctx.synchronize();
VERIFY_IS_EQUAL(v, w);
}
EIGEN_DECLARE_TEST(gpu_library_example) {
CALL_SUBTEST(test_gpu_context());
CALL_SUBTEST(test_gpu_buffer());
CALL_SUBTEST(test_dense_roundtrip<MatrixXf>());
CALL_SUBTEST(test_dense_roundtrip<MatrixXd>());
CALL_SUBTEST((test_dense_roundtrip<Matrix<float, Dynamic, Dynamic, RowMajor>>()));
CALL_SUBTEST((test_dense_roundtrip<Matrix<double, Dynamic, Dynamic, RowMajor>>()));
CALL_SUBTEST(test_vector_roundtrip<float>());
CALL_SUBTEST(test_vector_roundtrip<double>());
CALL_SUBTEST(test_vector_roundtrip<std::complex<float>>());
}

View File

@@ -1,90 +0,0 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H
#define EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H
// Helpers for GPU tests that call NVIDIA library APIs (cuBLAS, cuSOLVER, etc.)
// from the host side. Provides RAII GPU memory management and async matrix transfer.
//
// This is separate from gpu_common.h (element-parallel device kernels) and
// gpu_test_helper.h (serialization-based device kernels). Those patterns run
// user functors inside GPU kernels. This helper is for host-orchestrated tests
// that call library APIs which launch their own kernels internally.
//
// All transfers use an explicit stream and cudaMemcpyAsync. Callers must
// synchronize (ctx.synchronize() or cudaStreamSynchronize) before reading
// results back on the host.
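//
// Minimal usage sketch (assumes a GpuContext named `ctx`, as in
// gpu_library_example.cpp; only the helpers declared below are used):
//
//   MatrixXf A = MatrixXf::Random(64, 32);
//   auto buf = gpu_copy_to_device(ctx.stream, A);    // async H2D copy
//   // ... call a library routine that reads/writes buf.data on ctx.stream ...
//   MatrixXf B(64, 32);
//   gpu_copy_to_host(ctx.stream, buf, B);            // async D2H copy
//   ctx.synchronize();                               // required before reading B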
#include "gpu_test_helper.h"
namespace Eigen {
namespace test {
// RAII wrapper for GPU device memory. Prevents leaks when VERIFY macros abort.
template <typename Scalar>
struct GpuBuffer {
Scalar* data = nullptr;
Index size = 0;
GpuBuffer() = default;
explicit GpuBuffer(Index n) : size(n) { GPU_CHECK(gpuMalloc(reinterpret_cast<void**>(&data), n * sizeof(Scalar))); }
~GpuBuffer() {
if (data) GPU_CHECK(gpuFree(data));
}
// Move-only.
GpuBuffer(GpuBuffer&& other) noexcept : data(other.data), size(other.size) {
other.data = nullptr;
other.size = 0;
}
GpuBuffer& operator=(GpuBuffer&& other) noexcept {
if (this != &other) {
if (data) GPU_CHECK(gpuFree(data));
data = other.data;
size = other.size;
other.data = nullptr;
other.size = 0;
}
return *this;
}
GpuBuffer(const GpuBuffer&) = delete;
GpuBuffer& operator=(const GpuBuffer&) = delete;
// Async zero the buffer on the given stream.
void setZeroAsync(cudaStream_t stream) { GPU_CHECK(cudaMemsetAsync(data, 0, size * sizeof(Scalar), stream)); }
};
// Copy a dense Eigen matrix to a new GPU buffer, async on the given stream.
// Caller must synchronize before the host matrix is freed or modified.
template <typename Derived>
GpuBuffer<typename Derived::Scalar> gpu_copy_to_device(cudaStream_t stream, const MatrixBase<Derived>& host_mat) {
using Scalar = typename Derived::Scalar;
const auto& mat = host_mat.derived();
GpuBuffer<Scalar> buf(mat.size());
GPU_CHECK(cudaMemcpyAsync(buf.data, mat.data(), mat.size() * sizeof(Scalar), cudaMemcpyHostToDevice, stream));
return buf;
}
// Copy GPU buffer contents back to a dense Eigen matrix, async on the given stream.
// Caller must synchronize before reading from host_mat.
template <typename Scalar, typename Derived>
void gpu_copy_to_host(cudaStream_t stream, const GpuBuffer<Scalar>& buf, MatrixBase<Derived>& host_mat) {
auto& mat = host_mat.derived();
eigen_assert(buf.size == mat.size());
GPU_CHECK(cudaMemcpyAsync(mat.data(), buf.data, mat.size() * sizeof(Scalar), cudaMemcpyDeviceToHost, stream));
}
} // namespace test
} // namespace Eigen
#endif // EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H

View File

@@ -6,8 +6,10 @@
// Allow gpu** macros for generic tests.
#include <Eigen/src/Core/util/GpuHipCudaDefines.inc>
// std::tuple cannot be used on device, so use our custom implementation there.
#if defined(EIGEN_GPU_COMPILE_PHASE)
// std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that
// doesn't allow std::tuple to compile for host code either. In these cases,
// use our custom implementation.
#if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000)
#define EIGEN_USE_CUSTOM_TUPLE 1
#else
#define EIGEN_USE_CUSTOM_TUPLE 0
@@ -40,12 +42,6 @@ using tuple_impl::tuple;
#undef EIGEN_USE_CUSTOM_TUPLE
} // namespace test_detail
template <typename T>
using decay_t = typename std::decay<T>::type;
template <typename Func, typename... Args>
using kernel_result_t = decltype(std::declval<Func>()(std::declval<Args>()...));
template <size_t N, size_t Idx, typename OutputIndexSequence, typename... Ts>
struct extract_output_indices_helper;
@@ -94,15 +90,14 @@ struct void_helper {
// Non-void return value.
template <typename Func, typename... Args>
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
-> std::enable_if_t<!std::is_same<kernel_result_t<Func&&, Args&&...>, void>::value,
kernel_result_t<Func&&, Args&&...>> {
-> std::enable_if_t<!std::is_same<decltype(func(args...)), void>::value, decltype(func(args...))> {
return func(std::forward<Args>(args)...);
}
// Void return value.
template <typename Func, typename... Args>
static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
-> std::enable_if_t<std::is_same<kernel_result_t<Func&&, Args&&...>, void>::value, Void> {
-> std::enable_if_t<std::is_same<decltype(func(args...)), void>::value, Void> {
func(std::forward<Args>(args)...);
return Void{};
}
@@ -140,18 +135,18 @@ EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence<Indices...>, std::inde
const uint8_t* read_end = buffer + capacity;
read_ptr = Eigen::deserialize(read_ptr, read_end, input_size);
// Create value-type instances to populate.
auto args = make_tuple(decay_t<Args>{}...);
auto args = make_tuple(typename std::decay<Args>::type{}...);
EIGEN_UNUSED_VARIABLE(args); // Avoid NVCC compile warning.
// NVCC 9.1 requires us to spell out the template parameters explicitly.
read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, decay_t<Args>...>(args)...);
read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, typename std::decay<Args>::type...>(args)...);
// Call function, with void->Void conversion so we are guaranteed a complete
// output type.
auto result = void_helper::call(kernel, get<Indices, decay_t<Args>...>(args)...);
auto result = void_helper::call(kernel, get<Indices, typename std::decay<Args>::type...>(args)...);
// Determine required output size.
size_t output_size = Eigen::serialize_size(capacity);
output_size += Eigen::serialize_size(get<OutputIndices, decay_t<Args>...>(args)...);
output_size += Eigen::serialize_size(get<OutputIndices, typename std::decay<Args>::type...>(args)...);
output_size += Eigen::serialize_size(result);
// Always serialize required buffer size.
@@ -162,7 +157,7 @@ EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence<Indices...>, std::inde
// Serialize outputs if they fit in the buffer.
if (output_size <= capacity) {
// Collect outputs and result.
write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, decay_t<Args>...>(args)...);
write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, typename std::decay<Args>::type...>(args)...);
write_ptr = Eigen::serialize(write_ptr, write_end, result);
}
}
@@ -287,7 +282,7 @@ auto run_serialized_on_gpu(size_t buffer_capacity_hint, std::index_sequence<Indi
* \return kernel(args...).
*/
template <typename Kernel, typename... Args>
auto run_on_cpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
return kernel(std::forward<Args>(args)...);
}
@@ -306,7 +301,7 @@ auto run_on_cpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kern
* \return kernel(args...).
*/
template <typename Kernel, typename... Args>
auto run_on_gpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
return internal::run_serialized_on_gpu<Kernel, Args...>(
/*buffer_capacity_hint=*/0, std::make_index_sequence<sizeof...(Args)>{},
internal::extract_output_indices<Args...>{}, kernel, std::forward<Args>(args)...);
@@ -327,8 +322,7 @@ auto run_on_gpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kern
* \sa run_on_gpu
*/
template <typename Kernel, typename... Args>
auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args)
-> internal::kernel_result_t<Kernel, Args&&...> {
auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
return internal::run_serialized_on_gpu<Kernel, Args...>(
buffer_capacity_hint, std::make_index_sequence<sizeof...(Args)>{}, internal::extract_output_indices<Args...>{},
kernel, std::forward<Args>(args)...);
@@ -415,7 +409,7 @@ void print_gpu_device_info() {
* \return kernel(args...).
*/
template <typename Kernel, typename... Args>
auto run(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
#ifdef EIGEN_GPUCC
return run_on_gpu(kernel, std::forward<Args>(args)...);
#else
@@ -438,8 +432,7 @@ auto run(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Arg
* \sa run
*/
template <typename Kernel, typename... Args>
auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args)
-> internal::kernel_result_t<Kernel, Args&&...> {
auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
#ifdef EIGEN_GPUCC
return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward<Args>(args)...);
#else

View File

@@ -76,8 +76,10 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#if CUDA_VERSION >= 7050
#include <cuda_fp16.h>
#endif
#endif
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
#define EIGEN_TEST_NO_LONGDOUBLE
@@ -947,37 +949,6 @@ inline void set_seed_from_time() {
g_seed = static_cast<decltype(g_seed)>(ns);
}
#if defined(EIGEN_USE_GPU)
inline int maybe_skip_gpu_tests() {
#if defined(EIGEN_USE_HIP)
int device_count = 0;
hipError_t status = hipGetDeviceCount(&device_count);
if (status != hipSuccess) {
std::cout << "SKIP: HIP GPU tests require a visible ROCm device. hipGetDeviceCount failed with: "
<< hipGetErrorString(status) << std::endl;
return 77;
}
if (device_count <= 0) {
std::cout << "SKIP: HIP GPU tests require a visible ROCm device." << std::endl;
return 77;
}
#elif defined(EIGEN_CUDACC)
int device_count = 0;
cudaError_t status = cudaGetDeviceCount(&device_count);
if (status != cudaSuccess) {
std::cout << "SKIP: CUDA GPU tests require a visible CUDA device. cudaGetDeviceCount failed with: "
<< cudaGetErrorString(status) << std::endl;
return 77;
}
if (device_count <= 0) {
std::cout << "SKIP: CUDA GPU tests require a visible CUDA device." << std::endl;
return 77;
}
#endif
return 0;
}
#endif
int main(int argc, char* argv[]) {
g_has_set_repeat = false;
g_has_set_seed = false;
@@ -1026,13 +997,6 @@ int main(int argc, char* argv[]) {
srand(g_seed);
std::cout << "Repeating each test " << g_repeat << " times" << std::endl;
#if defined(EIGEN_USE_GPU)
{
const int skip_code = maybe_skip_gpu_tests();
if (skip_code != 0) return skip_code;
}
#endif
VERIFY(EigenTest::all().size() > 0);
for (std::size_t i = 0; i < EigenTest::all().size(); ++i) {

View File

@@ -233,15 +233,17 @@ static std::vector<FuncEntry<Scalar>> build_func_table() {
// Range iteration helpers
// ============================================================================
// Advances x toward +inf by at least 1 ULP. When step_eps > 0, additionally
// jumps by a relative factor of (1 + step_eps) to sample the range sparsely.
// Advances a non-negative value toward +inf by at least 1 ULP. When step_eps > 0,
// additionally jumps by max(|x|, min_normal) * step_eps. For normals this is
// equivalent to x * (1 + eps). For denormals where x * eps < smallest_denormal,
// the min_normal floor ensures we still skip through the denormal region at a
// rate matching the smallest normals rather than stalling at 1 ULP per step.
template <typename Scalar>
static inline Scalar advance_by_step(Scalar x, double step_eps) {
static inline Scalar advance_positive(Scalar x, double step_eps) {
Scalar next = std::nextafter(x, std::numeric_limits<Scalar>::infinity());
if (step_eps > 0.0 && std::isfinite(next)) {
// Try to jump further by a relative amount.
Scalar jumped = next > 0 ? next * static_cast<Scalar>(1.0 + step_eps) : next / static_cast<Scalar>(1.0 + step_eps);
// Use the jump only if it actually advances further (handles denormal stalling).
Scalar base = std::max(next, std::numeric_limits<Scalar>::min());
Scalar jumped = next + base * static_cast<Scalar>(step_eps);
if (jumped > next) next = jumped;
}
return next;
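// Worked example (a sketch, assuming float and step_eps = 1e-4): for x = 1.0f,
// next = nextafter(1.0f) and the jump adds max(next, FLT_MIN) * 1e-4 ~= 1e-4,
// i.e. roughly one sample per 1e-4 of relative spacing. For a denormal such as
// x ~= 1e-41f, x * 1e-4 would be below the smallest denormal, but the FLT_MIN
// floor keeps the jump at about 1.2e-42 (~800 ULPs) per step instead of 1 ULP.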
@@ -281,26 +283,60 @@ static double linear_to_scalar(int64_t lin, double /*tag*/) {
// Dynamic work queue: threads atomically claim chunks for load balancing
// ============================================================================
// Work queue that distributes chunks in positive absolute-value linear space.
// Iteration goes outward from 0: the worker tests both +|x| and -|x| for
// each sampled magnitude, so the multiplicative step (1 + eps) always works
// cleanly — no special handling for negative values needed.
template <typename Scalar>
struct WorkQueue {
int64_t range_hi_lin;
int64_t chunk_size;
double step_eps;
std::atomic<int64_t> next_lin;
Scalar orig_lo; // original range for sign filtering
Scalar orig_hi;
bool test_pos; // whether any positive values are in [lo, hi]
bool test_neg; // whether any negative values are in [lo, hi]
void init(Scalar lo, Scalar hi, int64_t csz, double step) {
range_hi_lin = scalar_to_linear(hi);
chunk_size = csz;
void init(Scalar lo, Scalar hi, int num_threads, double step) {
orig_lo = lo;
orig_hi = hi;
test_pos = (hi >= Scalar(0));
test_neg = (lo < Scalar(0));
// Compute absolute-value iteration range.
Scalar abs_lo, abs_hi;
if (lo <= Scalar(0) && hi >= Scalar(0)) {
abs_lo = Scalar(0);
abs_hi = std::max(std::abs(lo), hi);
} else {
abs_lo = std::min(std::abs(lo), std::abs(hi));
abs_hi = std::max(std::abs(lo), std::abs(hi));
}
range_hi_lin = scalar_to_linear(abs_hi);
step_eps = step;
next_lin.store(scalar_to_linear(lo), std::memory_order_relaxed);
next_lin.store(scalar_to_linear(abs_lo), std::memory_order_relaxed);
uint64_t total_abs = count_scalars_in_range(abs_lo, abs_hi);
chunk_size = std::max(int64_t(1), static_cast<int64_t>(total_abs / (num_threads * 16)));
if (step > 0.0) {
// Ensure chunks are large enough that advance_positive's min_normal floor
// can actually skip the denormal region. The denormal region contains
// count_scalars_in_range(0, min_normal) ULPs; any chunk must span at
// least that many so the min_normal-based jump lands past chunk_hi.
int64_t denorm_span = static_cast<int64_t>(count_scalars_in_range(Scalar(0), std::numeric_limits<Scalar>::min()));
chunk_size = std::max(chunk_size, denorm_span);
}
}
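// Rough sizing sketch (an illustration, not taken from the code): for float over
// [0, 1] with step_eps = 0 and 8 threads, the range holds about 1.07e9
// representable values (every float bit pattern from +0 up to 1.0f), so
// chunk_size ~= 1.07e9 / (8 * 16) ~= 8.3e6 values per claimed chunk, giving
// each thread roughly 16 chunks to balance uneven per-value cost.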
// Claim the next chunk. Returns false when no work remains.
// Claim the next chunk of absolute values. Returns false when no work remains.
bool claim(Scalar& chunk_lo, Scalar& chunk_hi) {
int64_t lo_lin = next_lin.fetch_add(chunk_size, std::memory_order_relaxed);
if (lo_lin > range_hi_lin) return false;
int64_t hi_lin = lo_lin + chunk_size - 1;
if (hi_lin > range_hi_lin) hi_lin = range_hi_lin;
if (lo_lin > range_hi_lin || lo_lin < 0) return false;
// Compute hi_lin carefully to avoid int64_t overflow.
int64_t remaining = range_hi_lin - lo_lin;
int64_t hi_lin = (remaining < chunk_size - 1) ? range_hi_lin : lo_lin + chunk_size - 1;
chunk_lo = linear_to_scalar(lo_lin, Scalar(0));
chunk_hi = linear_to_scalar(hi_lin, Scalar(0));
return true;
@@ -322,8 +358,12 @@ static void worker(const FuncEntry<Scalar>& func, WorkQueue<Scalar>& queue, int
#ifdef EIGEN_HAS_MPFR
mpfr_t mp_in, mp_out;
if (use_mpfr) {
mpfr_init2(mp_in, 128);
mpfr_init2(mp_out, 128);
// Use 2x the mantissa bits of Scalar for the reference: 48 for float (24-bit
// mantissa), 106 for double (53-bit mantissa). This is sufficient for correctly-
// rounded results while keeping MPFR evaluation fast.
constexpr int kMpfrBits = std::is_same<Scalar, float>::value ? 48 : 106;
mpfr_init2(mp_in, kMpfrBits);
mpfr_init2(mp_out, kMpfrBits);
}
#else
(void)use_mpfr;
@@ -348,32 +388,42 @@ static void worker(const FuncEntry<Scalar>& func, WorkQueue<Scalar>& queue, int
}
};
auto flush_batch = [&](int& idx) {
if (idx == 0) return;
for (int i = idx; i < batch_size; i++) input[i] = input[idx - 1];
func.eigen_eval(eigen_out, input);
process_batch(idx, input, eigen_out);
idx = 0;
};
auto push_value = [&](Scalar v, int& idx) {
input[idx++] = v;
if (idx == batch_size) flush_batch(idx);
};
Scalar chunk_lo, chunk_hi;
while (queue.claim(chunk_lo, chunk_hi)) {
int idx = 0;
Scalar x = chunk_lo;
Scalar abs_x = chunk_lo;
for (;;) {
input[idx] = x;
idx++;
if (idx == batch_size) {
func.eigen_eval(eigen_out, input);
process_batch(batch_size, input, eigen_out);
idx = 0;
// Test +|x| if positive values are in range.
if (queue.test_pos && abs_x >= queue.orig_lo && abs_x <= queue.orig_hi) {
push_value(abs_x, idx);
}
// Test -|x| if negative values are in range (skip -0 to avoid testing 0 twice).
if (queue.test_neg && abs_x != Scalar(0)) {
Scalar neg_x = -abs_x;
if (neg_x >= queue.orig_lo && neg_x <= queue.orig_hi) {
push_value(neg_x, idx);
}
}
if (x >= chunk_hi) break;
Scalar next = advance_by_step(x, queue.step_eps);
x = (next > chunk_hi) ? chunk_hi : next;
if (abs_x >= chunk_hi) break;
Scalar next = advance_positive(abs_x, queue.step_eps);
abs_x = (next > chunk_hi) ? chunk_hi : next;
}
// Process remaining partial batch. Pad unused slots with the last valid
// input so the full-size vectorized eval doesn't read uninitialized memory.
if (idx > 0) {
for (int i = idx; i < batch_size; i++) input[i] = input[idx - 1];
func.eigen_eval(eigen_out, input);
process_batch(idx, input, eigen_out);
}
flush_batch(idx);
}
#ifdef EIGEN_HAS_MPFR
@@ -439,11 +489,12 @@ static int run_test(const Options& opts) {
std::printf("Function: %s (%s)\n", opts.func_name.c_str(), kTypeName);
std::printf("Range: [%.*g, %.*g]\n", kDigits, double(lo), kDigits, double(hi));
if (opts.step_eps > 0.0) {
std::printf("Sampling step: (1 + %g) * nextafter(x)\n", opts.step_eps);
std::printf("Sampling step: |x| * (1 + %g)\n", opts.step_eps);
} else {
std::printf("Representable values in range: %lu\n", static_cast<unsigned long>(total_scalars));
}
std::printf("Reference: %s\n", opts.use_mpfr ? "MPFR (128-bit)" : "std C++ math");
std::printf("Reference: %s\n",
opts.use_mpfr ? (opts.use_double ? "MPFR (106-bit)" : "MPFR (48-bit)") : "std C++ math");
std::printf("Threads: %d\n", num_threads);
std::printf("Batch size: %d\n", opts.batch_size);
std::printf("\n");
@@ -459,13 +510,8 @@ static int run_test(const Options& opts) {
results.back()->init(opts.hist_width);
}
// Use dynamic work distribution: threads claim small chunks from a shared
// queue. This ensures even load balancing regardless of how per-value
// work varies across the range (e.g. log on negatives is trivial).
// Choose chunk_size so we get ~16 chunks per thread for good balancing.
int64_t chunk_size = std::max(int64_t(1), static_cast<int64_t>(total_scalars / (num_threads * 16)));
WorkQueue<Scalar> queue;
queue.init(lo, hi, chunk_size, opts.step_eps);
queue.init(lo, hi, num_threads, opts.step_eps);
std::vector<std::thread> threads;
auto start_time = std::chrono::steady_clock::now();

View File

@@ -393,8 +393,7 @@ __device__ EIGEN_STRONG_INLINE void EigenContractionKernelInternal(const LhsMapp
// the sum across all big k blocks of the product of little k block of index (x, y)
// with block of index (y, z). To compute the final output, we need to reduce
// the 8 threads over y by summation.
// HIP uses non-sync warp shuffles; CUDA requires the _sync variants.
#if defined(EIGEN_HIPCC)
#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
#else
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
@@ -623,7 +622,7 @@ __device__ __forceinline__ void EigenFloatContractionKernelInternal16x16(const L
x1 = rhs_pf0.x;
x2 = rhs_pf0.z;
}
#if defined(EIGEN_HIPCC)
#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
x1 = __shfl_xor(x1, 4);
x2 = __shfl_xor(x2, 4);
#else
@@ -1378,6 +1377,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
this->m_right_contracting_strides, this->m_k_strides);
OutputMapper output(buffer, m);
#if defined(EIGEN_USE_HIP)
setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
#else
setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
#endif
LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k,
this->m_device);
}

View File

@@ -89,7 +89,7 @@ class IndexMapper {
}
} else {
for (int i = NumDims - 1; i >= 0; --i) {
if (i + 1 < static_cast<int>(offset)) {
if (static_cast<size_t>(i + 1) < offset) {
m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
} else {

View File

@@ -342,6 +342,19 @@ struct GpuDevice {
#endif
// FIXME: Should be device and kernel specific.
#ifdef EIGEN_GPUCC
static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_GPU_COMPILE_PHASE
gpuError_t status = gpuDeviceSetSharedMemConfig(config);
EIGEN_UNUSED_VARIABLE(status);
gpu_assert(status == gpuSuccess);
#else
EIGEN_UNUSED_VARIABLE(config);
#endif
}
#endif
} // end namespace Eigen
// undefine all the gpu* macros we defined at the beginning of the file

View File

@@ -175,7 +175,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T loadConstant(const T* address) {
return *address;
}
// Use the texture cache on CUDA devices whenever possible
#if defined(EIGEN_CUDA_ARCH)
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) {
return __ldg(address);

View File

@@ -49,7 +49,7 @@ struct PacketType : internal::packet_traits<Scalar> {
};
// For CUDA packet types when using a GpuDevice
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPU_COMPILE_PHASE)
#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE)
typedef ulonglong2 Packet4h2;
template <>

View File

@@ -453,7 +453,7 @@ template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
unsigned int*);
#if defined(EIGEN_GPUCC)
#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
R, const S, I_, internal::packet_traits<half>::type*);
@@ -883,7 +883,7 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_>
KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
#if defined(EIGEN_GPUCC)
#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_>
KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_,
internal::packet_traits<Eigen::half>::type*);

View File

@@ -25,6 +25,7 @@ namespace internal {
// updated the content of the output address it will try again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
if (sizeof(T) == 4) {
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
unsigned int newval = oldval;
@@ -60,6 +61,12 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
} else {
gpu_assert(0 && "Wordsize not supported");
}
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(accum);
EIGEN_UNUSED_VARIABLE(reducer);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
// We extend atomicExch to support extra data types
@@ -68,42 +75,13 @@ __device__ inline Type atomicExchCustom(Type* address, Type val) {
return atomicExch(address, val);
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR auto reduction_shuffle_mask() {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return 0xFFFFFFFFFFFFFFFFull;
#else
return 0xFFFFFFFFu;
#endif
}
template <typename T>
__device__ EIGEN_ALWAYS_INLINE T reduction_shuffle_down(T value, int offset) {
return __shfl_down_sync(reduction_shuffle_mask<T>(), value, offset, warpSize);
}
template <>
__device__ EIGEN_ALWAYS_INLINE int reduction_shuffle_down<int>(int value, int offset) {
return __shfl_down_sync(reduction_shuffle_mask<int>(), value, offset, warpSize);
}
template <>
__device__ EIGEN_ALWAYS_INLINE float reduction_shuffle_down<float>(float value, int offset) {
return __shfl_down_sync(reduction_shuffle_mask<float>(), value, offset, warpSize);
}
template <>
__device__ EIGEN_ALWAYS_INLINE double reduction_shuffle_down<double>(double value, int offset) {
return __shfl_down_sync(reduction_shuffle_mask<double>(), value, offset, warpSize);
}
template <>
__device__ inline double atomicExchCustom(double* address, double val) {
unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
// Half-float reduction specializations.
#ifdef EIGEN_HAS_GPU_FP16
template <typename R>
__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@@ -133,10 +111,17 @@ __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reduc
}
}
#endif // EIGEN_GPU_COMPILE_PHASE
#endif // EIGEN_HAS_GPU_FP16
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
atomicAdd(output, accum);
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(accum);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
template <typename CoeffType, typename Index>
@@ -153,6 +138,7 @@ template <int BlockSize, int NumPerThread, typename Self, typename Reducer, type
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
typename Self::CoeffReturnType* output,
unsigned int* semaphore) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
// Initialize the output value
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
if (gridDim.x == 1) {
@@ -193,7 +179,20 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer
#pragma unroll
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
reducer.reduce(reduction_shuffle_down(accum, offset), &accum);
#if defined(EIGEN_HIPCC)
// use std::is_floating_point to determine the type of reduced_val
// This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
// and list the float and int versions of __shfl_down as the candidate functions.
if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
} else {
reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
}
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
#endif
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -207,9 +206,17 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer
__threadfence_system();
#endif
}
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(reducer);
EIGEN_UNUSED_VARIABLE(input);
EIGEN_UNUSED_VARIABLE(num_coeffs);
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(semaphore);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
// Half-float reduction specializations.
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input,
Index num_coeffs, half* scratch) {
@@ -312,6 +319,14 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce
hr[i] = wka_out.h;
}
reducer.reducePacket(r1, &accum);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
PacketType r1;
half2* hr = reinterpret_cast<half2*>(&r1);
half2* hacc = reinterpret_cast<half2*>(&accum);
for (int i = 0; i < packet_width / 2; i++) {
hr[i] = __shfl_down(hacc[i], offset, warpSize);
}
reducer.reducePacket(r1, &accum);
#else
PacketType r1;
half2* hr = reinterpret_cast<half2*>(&r1);
@@ -362,6 +377,8 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op
}
}
#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
@@ -392,7 +409,7 @@ struct FullReductionLauncher<
}
};
// Half-float reduction specializations.
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
@@ -426,18 +443,24 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
}
}
};
#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
// Unfortunately nvidia doesn't support well exotic types such as complex,
// so reduce the scope of the optimized version of the code to the simple cases
// of doubles, floats and half floats
// Half-float reduction specializations.
#ifdef EIGEN_HAS_GPU_FP16
static constexpr bool HasOptimizedImplementation =
!Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
static constexpr bool HasOptimizedImplementation =
!Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16
template <typename OutputType>
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
@@ -458,6 +481,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
Index num_coeffs_to_reduce,
Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
typedef typename Self::CoeffReturnType Type;
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
@@ -510,7 +534,20 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
#pragma unroll
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
reducer.reduce(reduction_shuffle_down(reduced_val, offset), &reduced_val);
#if defined(EIGEN_HIPCC)
// use std::is_floating_point to determine the type of reduced_val
// This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
// and list the float and int versions of __shfl_down as the candidate functions.
if (std::is_floating_point<Type>::value) {
reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
} else {
reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
}
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
#endif
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -518,9 +555,17 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reduce
}
}
}
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(reducer);
EIGEN_UNUSED_VARIABLE(input);
EIGEN_UNUSED_VARIABLE(num_coeffs_to_reduce);
EIGEN_UNUSED_VARIABLE(num_preserved_coeffs);
EIGEN_UNUSED_VARIABLE(output);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
// Half-float reduction specializations.
#ifdef EIGEN_HAS_GPU_FP16
template <int NumPerThread, typename Self, typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
@@ -643,6 +688,19 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
}
reducer.reducePacket(r1, &reduced_val1);
reducer.reducePacket(r2, &reduced_val2);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
PacketType r1;
PacketType r2;
half2* hr1 = reinterpret_cast<half2*>(&r1);
half2* hr2 = reinterpret_cast<half2*>(&r2);
half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
for (int i = 0; i < packet_width / 2; i++) {
hr1[i] = __shfl_down(rv1[i], offset, warpSize);
hr2[i] = __shfl_down(rv2[i], offset, warpSize);
}
reducer.reducePacket(r1, &reduced_val1);
reducer.reducePacket(r2, &reduced_val2);
#else
PacketType r1;
PacketType r2;
@@ -683,6 +741,8 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
}
}
#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index,
@@ -726,7 +786,7 @@ struct InnerReductionLauncher<
}
};
// Half-float reduction specializations.
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
@@ -766,18 +826,24 @@ struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
return false;
}
};
#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
// Unfortunately nvidia doesn't support well exotic types such as complex,
// so reduce the scope of the optimized version of the code to the simple case
// of floats and half floats.
// Half-float reduction specializations.
#ifdef EIGEN_HAS_GPU_FP16
static constexpr bool HasOptimizedImplementation =
!Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
static constexpr bool HasOptimizedImplementation =
!Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16
template <typename OutputType>
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,

View File

@@ -237,7 +237,7 @@ if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "MS
ei_add_test(cxx11_tensor_uint128)
endif()
find_package(CUDA 11.4)
find_package(CUDA 9.0)
if(CUDA_FOUND AND EIGEN_TEST_CUDA)
# Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
# and -fno-check-new flags since they trigger thousands of compilation warnings
@@ -281,11 +281,26 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
ei_add_test(cxx11_tensor_argmax_gpu)
ei_add_test(cxx11_tensor_cast_float16_gpu)
ei_add_test(cxx11_tensor_scan_gpu)
ei_add_test(cxx11_tensor_device)
ei_add_test(cxx11_tensor_gpu)
ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_of_float16_gpu)
ei_add_test(cxx11_tensor_random_gpu)
set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH 9999)
foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
if(${ARCH} LESS ${EIGEN_CUDA_OLDEST_COMPUTE_ARCH})
set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH ${ARCH})
endif()
endforeach()
# Contractions require arch 3.0 or higher
if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 29)
ei_add_test(cxx11_tensor_device)
ei_add_test(cxx11_tensor_gpu)
ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_of_float16_gpu)
endif()
# The random number generation code requires arch 3.5 or greater.
if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 34)
ei_add_test(cxx11_tensor_random_gpu)
endif()
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()
@@ -326,6 +341,7 @@ if (EIGEN_TEST_HIP)
ei_add_test(cxx11_tensor_cast_float16_gpu)
ei_add_test(cxx11_tensor_scan_gpu)
ei_add_test(cxx11_tensor_device)
ei_add_test(cxx11_tensor_gpu)
ei_add_test(cxx11_tensor_contract_gpu)
ei_add_test(cxx11_tensor_of_float16_gpu)

View File

@@ -850,7 +850,6 @@ void test_gpu_igamma() {
Tensor<Scalar, 2> a(6, 6);
Tensor<Scalar, 2> x(6, 6);
Tensor<Scalar, 2> out(6, 6);
Tensor<Scalar, 2> expected_out(6, 6);
out.setZero();
Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
@@ -863,11 +862,14 @@ void test_gpu_igamma() {
}
}
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
expected_out(i, j) = numext::igamma(a(i, j), x(i, j));
}
}
Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
Scalar igamma_s[][6] = {
{0.0, nan, nan, nan, nan, nan},
{0.0, 0.6321205588285578, 0.7768698398515702, 0.9816843611112658, 9.999500016666262e-05, 1.0},
{0.0, 0.4275932955291202, 0.608374823728911, 0.9539882943107686, 7.522076445089201e-07, 1.0},
{0.0, 0.01898815687615381, 0.06564245437845008, 0.5665298796332909, 4.166333347221828e-18, 1.0},
{0.0, 0.9999780593618628, 0.9999899967080838, 0.9999996219837988, 0.9991370418689945, 1.0},
{0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
std::size_t bytes = a.size() * sizeof(Scalar);
@@ -895,10 +897,10 @@ void test_gpu_igamma() {
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
if ((std::isnan)(expected_out(i, j))) {
if ((std::isnan)(igamma_s[i][j])) {
VERIFY((std::isnan)(out(i, j)));
} else {
VERIFY_IS_APPROX(out(i, j), expected_out(i, j));
VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
}
}
}
@@ -913,7 +915,6 @@ void test_gpu_igammac() {
Tensor<Scalar, 2> a(6, 6);
Tensor<Scalar, 2> x(6, 6);
Tensor<Scalar, 2> out(6, 6);
Tensor<Scalar, 2> expected_out(6, 6);
out.setZero();
Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
@@ -926,11 +927,14 @@ void test_gpu_igammac() {
}
}
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
expected_out(i, j) = numext::igammac(a(i, j), x(i, j));
}
}
Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
Scalar igammac_s[][6] = {
{nan, nan, nan, nan, nan, nan},
{1.0, 0.36787944117144233, 0.22313016014842982, 0.018315638888734182, 0.9999000049998333, 0.0},
{1.0, 0.5724067044708798, 0.3916251762710878, 0.04601170568923136, 0.9999992477923555, 0.0},
{1.0, 0.9810118431238462, 0.9343575456215499, 0.4334701203667089, 1.0, 0.0},
{1.0, 2.1940638138146658e-05, 1.0003291916285e-05, 3.7801620118431334e-07, 0.0008629581310054535, 0.0},
{1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
std::size_t bytes = a.size() * sizeof(Scalar);
@@ -958,10 +962,10 @@ void test_gpu_igammac() {
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 6; ++j) {
if ((std::isnan)(expected_out(i, j))) {
if ((std::isnan)(igammac_s[i][j])) {
VERIFY((std::isnan)(out(i, j)));
} else {
VERIFY_IS_APPROX(out(i, j), expected_out(i, j));
VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
}
}
}
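igammac is the complementary (upper) regularized incomplete gamma, so each finite igammac_s entry is one minus the corresponding igamma_s entry; for a = 1 this gives Q(1, 1) = exp(-1) = 0.36787944117144233, as in the second row. A short complementarity check on that row (illustrative, not part of the test):

#include <cassert>
#include <cmath>

// Spot check (not part of the commit): the a = 1 rows of igamma_s and
// igammac_s must sum to 1 entry by entry, since Q(a, x) = 1 - P(a, x).
int main() {
  const double p_row[] = {0.0, 0.6321205588285578, 0.7768698398515702, 0.9816843611112658};
  const double q_row[] = {1.0, 0.36787944117144233, 0.22313016014842982, 0.018315638888734182};
  for (int i = 0; i < 4; ++i) {
    assert(std::abs(p_row[i] + q_row[i] - 1.0) < 1e-12);
  }
  return 0;
}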
@@ -1064,9 +1068,15 @@ void test_gpu_ndtri() {
in_x(7) = Scalar(0.99);
in_x(8) = Scalar(0.01);
for (int i = 0; i < 9; ++i) {
expected_out(i) = numext::ndtri(in_x(i));
}
expected_out(0) = std::numeric_limits<Scalar>::infinity();
expected_out(1) = -std::numeric_limits<Scalar>::infinity();
expected_out(2) = Scalar(0.0);
expected_out(3) = Scalar(-0.8416212335729142);
expected_out(4) = Scalar(0.8416212335729142);
expected_out(5) = Scalar(1.2815515655446004);
expected_out(6) = Scalar(-1.2815515655446004);
expected_out(7) = Scalar(2.3263478740408408);
expected_out(8) = Scalar(-2.3263478740408408);
std::size_t bytes = in_x.size() * sizeof(Scalar);
@@ -1080,15 +1090,15 @@ void test_gpu_ndtri() {
Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 9);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 9);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
gpu_out.device(gpu_device) = gpu_in_x.ndtri();
assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
for (int i = 0; i < 9; ++i) {
for (int i = 0; i < 6; ++i) {
VERIFY_IS_CWISE_APPROX(out(i), expected_out(i));
}
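The hard-coded ndtri expectations are standard normal quantiles (for example, 2.3263478740408408 is the 99th percentile). They can be cross-checked by pushing each quantile back through the normal CDF, Phi(z) = 0.5 * erfc(-z / sqrt(2)), which needs only <cmath>. A self-contained round-trip check (illustrative, not part of the test):

#include <cassert>
#include <cmath>

// Round-trip check (not part of the commit): applying the normal CDF to each
// tabulated quantile must recover the original probability.
int main() {
  const double q80 = 0.8416212335729142;   // ndtri(0.8)
  const double q90 = 1.2815515655446004;   // ndtri(0.9)
  const double q99 = 2.3263478740408408;   // ndtri(0.99)
  assert(std::abs(0.5 * std::erfc(-q80 / std::sqrt(2.0)) - 0.8) < 1e-12);
  assert(std::abs(0.5 * std::erfc(-q90 / std::sqrt(2.0)) - 0.9) < 1e-12);
  assert(std::abs(0.5 * std::erfc(-q99 / std::sqrt(2.0)) - 0.99) < 1e-12);
  return 0;
}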
@@ -1105,9 +1115,12 @@ void test_gpu_betainc() {
Tensor<Scalar, 1> expected_out(125);
out.setZero();
Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
Array<Scalar, 1, Dynamic> x(125);
Array<Scalar, 1, Dynamic> a(125);
Array<Scalar, 1, Dynamic> b(125);
Array<Scalar, 1, Dynamic> v(125);
a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
@@ -1147,11 +1160,25 @@ void test_gpu_betainc() {
0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, 0.999995949033062, 0.9999999999993698,
0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan,
nan, nan, 0.006827081192655869, 0.0210336989586256, 0.04813160422599567, nan, nan, 0.20014344256217678,
0.5000000000000001, 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, 0.9999999999999999, nan,
nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan, nan, 7.864342668429763e-23,
3.015969667594166e-10, 0.0008598571564165444, nan, nan, 6.031987710123844e-08, 0.5000000000000007,
0.9999999396801229, nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan,
nan, nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan,
2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan;
for (int i = 0; i < 125; ++i) {
in_x(i) = x(i);
in_a(i) = a(i);
in_b(i) = b(i);
expected_out(i) = numext::betainc(a(i), b(i), x(i));
expected_out(i) = v(i);
}
std::size_t bytes = in_x.size() * sizeof(Scalar);
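The v table serves the same purpose for betainc(a, b, x), the regularized incomplete beta function I_x(a, b); the nan entries appear to mark out-of-domain arguments such as x outside [0, 1] or a zero shape parameter. Independent of the tabulated data, closed forms such as I_x(1, 1) = x and I_x(2, 2) = x^2 (3 - 2x) make convenient sanity checks; a crude numerical integration reproduces the latter (illustrative only, the parameters are not taken from the test data):

#include <cassert>
#include <cmath>

// Sanity check (not part of the commit): integrate the Beta(2, 2) density
// 6 t (1 - t) up to x with the midpoint rule and compare against the closed
// form I_x(2, 2) = x^2 (3 - 2x).
int main() {
  const double x = 0.8;
  const int n = 100000;
  double integral = 0.0;
  for (int i = 0; i < n; ++i) {
    const double t = (i + 0.5) * x / n;
    integral += 6.0 * t * (1.0 - t) * (x / n);
  }
  assert(std::abs(integral - x * x * (3.0 - 2.0 * x)) < 1e-6);
  return 0;
}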

View File

@@ -53,6 +53,8 @@ void test_gpu_numext() {
gpu_device.deallocate(d_res_float);
}
#ifdef EIGEN_HAS_GPU_FP16
template <typename>
void test_gpu_conversion() {
Eigen::GpuStreamDevice stream;
@@ -440,10 +442,12 @@ void test_gpu_forced_evals() {
gpu_device.deallocate(d_res_half2);
gpu_device.deallocate(d_res_float);
}
#endif
EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) {
CALL_SUBTEST_1(test_gpu_numext<void>());
#ifdef EIGEN_HAS_GPU_FP16
CALL_SUBTEST_1(test_gpu_conversion<void>());
CALL_SUBTEST_1(test_gpu_unary<void>());
CALL_SUBTEST_1(test_gpu_elementwise<void>());
@@ -452,4 +456,7 @@ EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) {
CALL_SUBTEST_3(test_gpu_reductions<void>());
CALL_SUBTEST_4(test_gpu_full_reductions<void>());
CALL_SUBTEST_5(test_gpu_forced_evals<void>());
#else
std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
#endif
}
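The guard introduced above compiles the half-precision subtests only when EIGEN_HAS_GPU_FP16 is defined and prints a skip message otherwise. User code can mirror the same pattern around its own Eigen::half GPU paths; a minimal sketch, assuming nothing beyond the macro itself:

#include <iostream>

// Minimal sketch (not part of this change): select the half-precision path
// only when Eigen's GPU headers report fp16 support via EIGEN_HAS_GPU_FP16.
int main() {
#ifdef EIGEN_HAS_GPU_FP16
  std::cout << "GPU half precision available: running fp16 kernels" << std::endl;
#else
  std::cout << "GPU half precision not available: using float kernels" << std::endl;
#endif
  return 0;
}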