GPU: Add sparse solvers, FFT, and SpMV (cuDSS, cuFFT, cuSPARSE)

Add GPU sparse direct solvers (Cholesky, LDL^T, LU) via cuDSS, 1D/2D FFT
via cuFFT with plan caching, and sparse matrix-vector/matrix multiply
(SpMV/SpMM) via cuSPARSE.
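Example usage (a sketch, assuming a CUDA 12 device with cuDSS installed;
mirrors the README additions in this commit):

```cpp
#define EIGEN_CUDSS
#include <Eigen/Dense>
#include <Eigen/SparseCore>
#include <Eigen/GPU>
using namespace Eigen;

int main() {
  // SPD tridiagonal test system: A = tridiag(-1, 2, -1).
  const int n = 64;
  SparseMatrix<double> A(n, n);
  for (int i = 0; i < n; ++i) {
    A.insert(i, i) = 2.0;
    if (i > 0) A.insert(i, i - 1) = -1.0;
    if (i + 1 < n) A.insert(i, i + 1) = -1.0;
  }
  A.makeCompressed();
  VectorXd b = VectorXd::Random(n);

  GpuSparseLLT<double> llt(A);       // cuDSS: analyze + factorize
  VectorXd x = llt.solve(b);         // sparse Cholesky solve on GPU

  GpuSparseContext<double> ctx;      // cuSPARSE SpMV
  double residual = (ctx.multiply(A, x) - b).norm();

  GpuFFT<double> fft;                // cuFFT, plans cached by (size, type)
  VectorXcd B = fft.fwd(b);          // 1D R2C: n/2 + 1 coefficients
  return residual < 1e-10 ? 0 : 1;
}
```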

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Rasmus Munk Larsen
2026-04-09 19:11:49 -07:00
parent 8593c7f5a1
commit 43a95b62bb
22 changed files with 2970 additions and 126 deletions

View File

@@ -50,6 +50,17 @@
#include "src/GPU/GpuQR.h" #include "src/GPU/GpuQR.h"
#include "src/GPU/GpuSVD.h" #include "src/GPU/GpuSVD.h"
#include "src/GPU/GpuEigenSolver.h" #include "src/GPU/GpuEigenSolver.h"
#include "src/GPU/CuFftSupport.h"
#include "src/GPU/GpuFFT.h"
#include "src/GPU/CuSparseSupport.h"
#include "src/GPU/GpuSparseContext.h"
#ifdef EIGEN_CUDSS
#include "src/GPU/CuDssSupport.h"
#include "src/GPU/GpuSparseSolverBase.h"
#include "src/GPU/GpuSparseLLT.h"
#include "src/GPU/GpuSparseLDLT.h"
#include "src/GPU/GpuSparseLU.h"
#endif
// IWYU pragma: end_exports
#endif

View File

@@ -0,0 +1,134 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// cuDSS support utilities: error checking macro, type mapping.
//
// cuDSS is NVIDIA's sparse direct solver library, supporting Cholesky (LL^T),
// LDL^T, and LU factorization on GPU. It requires CUDA 12.0+ and is
// distributed separately from the CUDA Toolkit.
#ifndef EIGEN_GPU_CUDSS_SUPPORT_H
#define EIGEN_GPU_CUDSS_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include <cudss.h>
namespace Eigen {
namespace internal {
// ---- Error checking ---------------------------------------------------------
#define EIGEN_CUDSS_CHECK(x) \
do { \
cudssStatus_t _s = (x); \
eigen_assert(_s == CUDSS_STATUS_SUCCESS && "cuDSS call failed: " #x); \
EIGEN_UNUSED_VARIABLE(_s); \
} while (0)
// ---- Scalar → cudssMatrixType_t for SPD/HPD ---------------------------------
template <typename Scalar>
struct cudss_spd_type;
template <>
struct cudss_spd_type<float> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
};
template <>
struct cudss_spd_type<double> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
};
template <>
struct cudss_spd_type<std::complex<float>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
};
template <>
struct cudss_spd_type<std::complex<double>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
};
// ---- Scalar → cudssMatrixType_t for symmetric/Hermitian ---------------------
template <typename Scalar>
struct cudss_symmetric_type;
template <>
struct cudss_symmetric_type<float> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
};
template <>
struct cudss_symmetric_type<double> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
};
template <>
struct cudss_symmetric_type<std::complex<float>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
};
template <>
struct cudss_symmetric_type<std::complex<double>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
};
// ---- StorageIndex → cudaDataType_t ------------------------------------------
template <typename StorageIndex>
struct cudss_index_type;
template <>
struct cudss_index_type<int> {
static constexpr cudaDataType_t value = CUDA_R_32I;
};
template <>
struct cudss_index_type<int64_t> {
static constexpr cudaDataType_t value = CUDA_R_64I;
};
// ---- UpLo → cudssMatrixViewType_t -------------------------------------------
// For symmetric matrices stored as CSC (ColMajor), cuDSS sees CSR of A^T.
// Since A = A^T, the data is the same, but the triangle view must be swapped.
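// Example: a SparseMatrix<double, ColMajor> holding the Lower triangle of A has
// its entries grouped by column; reading the same (outer, inner, value) arrays
// as CSR yields the Upper triangle of A^T == A, hence the Lower/ColMajor
// specialization below maps to CUDSS_MVIEW_UPPER.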
template <int UpLo, int StorageOrder>
struct cudss_view_type;
// ColMajor (CSC) passed as CSR: lower ↔ upper swap.
template <>
struct cudss_view_type<Lower, ColMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
};
template <>
struct cudss_view_type<Upper, ColMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
};
// RowMajor (CSR) passed directly: no swap needed.
template <>
struct cudss_view_type<Lower, RowMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
};
template <>
struct cudss_view_type<Upper, RowMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
};
} // namespace internal
// ---- Ordering enum ----------------------------------------------------------
enum class GpuSparseOrdering {
AMD, // Default fill-reducing ordering
METIS, // METIS nested dissection
RCM // Reverse Cuthill-McKee
};
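// Selected via GpuSparseSolverBase::setOrdering(), which must be called before
// compute()/analyzePattern(). Illustrative:
//   GpuSparseLLT<double> llt;
//   llt.setOrdering(GpuSparseOrdering::METIS);
//   llt.compute(A);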
} // namespace Eigen
#endif // EIGEN_GPU_CUDSS_SUPPORT_H

View File

@@ -0,0 +1,103 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// cuFFT support utilities: error checking macro, type mapping.
#ifndef EIGEN_GPU_CUFFT_SUPPORT_H
#define EIGEN_GPU_CUFFT_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include <cufft.h>
namespace Eigen {
namespace internal {
// ---- Error checking ---------------------------------------------------------
#define EIGEN_CUFFT_CHECK(x) \
do { \
cufftResult _r = (x); \
eigen_assert(_r == CUFFT_SUCCESS && "cuFFT call failed: " #x); \
EIGEN_UNUSED_VARIABLE(_r); \
} while (0)
// ---- Scalar → cufftType traits ----------------------------------------------
template <typename Scalar>
struct cufft_c2c_type;
template <>
struct cufft_c2c_type<float> {
static constexpr cufftType value = CUFFT_C2C;
};
template <>
struct cufft_c2c_type<double> {
static constexpr cufftType value = CUFFT_Z2Z;
};
template <typename Scalar>
struct cufft_r2c_type;
template <>
struct cufft_r2c_type<float> {
static constexpr cufftType value = CUFFT_R2C;
};
template <>
struct cufft_r2c_type<double> {
static constexpr cufftType value = CUFFT_D2Z;
};
template <typename Scalar>
struct cufft_c2r_type;
template <>
struct cufft_c2r_type<float> {
static constexpr cufftType value = CUFFT_C2R;
};
template <>
struct cufft_c2r_type<double> {
static constexpr cufftType value = CUFFT_Z2D;
};
// ---- Type-dispatched cuFFT execution ----------------------------------------
// C2C
inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<float>* in, std::complex<float>* out,
int direction) {
return cufftExecC2C(plan, reinterpret_cast<cufftComplex*>(in), reinterpret_cast<cufftComplex*>(out), direction);
}
inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<double>* in, std::complex<double>* out,
int direction) {
return cufftExecZ2Z(plan, reinterpret_cast<cufftDoubleComplex*>(in), reinterpret_cast<cufftDoubleComplex*>(out),
direction);
}
// R2C
inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, float* in, std::complex<float>* out) {
return cufftExecR2C(plan, in, reinterpret_cast<cufftComplex*>(out));
}
inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, double* in, std::complex<double>* out) {
return cufftExecD2Z(plan, in, reinterpret_cast<cufftDoubleComplex*>(out));
}
// C2R
inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<float>* in, float* out) {
return cufftExecC2R(plan, reinterpret_cast<cufftComplex*>(in), out);
}
inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<double>* in, double* out) {
return cufftExecZ2D(plan, reinterpret_cast<cufftDoubleComplex*>(in), out);
}
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_CUFFT_SUPPORT_H

View File

@@ -0,0 +1,34 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// cuSPARSE support utilities: error checking macro.
#ifndef EIGEN_GPU_CUSPARSE_SUPPORT_H
#define EIGEN_GPU_CUSPARSE_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include <cusparse.h>
namespace Eigen {
namespace internal {
#define EIGEN_CUSPARSE_CHECK(x) \
do { \
cusparseStatus_t _s = (x); \
eigen_assert(_s == CUSPARSE_STATUS_SUCCESS && "cuSPARSE call failed: " #x); \
EIGEN_UNUSED_VARIABLE(_s); \
} while (0)
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_CUSPARSE_SUPPORT_H

View File

@@ -116,6 +116,10 @@ class GpuSelfAdjointEigenSolver {
Index cols() const { return n_; }
Index rows() const { return n_; }
// TODO: Add device-side accessors (deviceEigenvalues(), deviceEigenvectors())
// returning DeviceMatrix views of the internal buffers, so users can chain
// GPU operations without round-tripping through host memory.
/** Eigenvalues in ascending order. Downloads from device. */
RealVector eigenvalues() const {
sync_info();

Eigen/src/GPU/GpuFFT.h Normal file
View File

@@ -0,0 +1,308 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU FFT via cuFFT.
//
// Standalone GPU FFT class with plan caching. Supports 1D and 2D transforms:
// C2C (complex-to-complex), R2C (real-to-complex), C2R (complex-to-real).
//
// Inverse transforms are scaled by 1/n (1D) or 1/(n*m) (2D) so that
// inv(fwd(x)) == x, matching Eigen's FFT convention.
//
// cuFFT plans are cached by (size, type) and reused across calls.
//
// Usage:
// GpuFFT<float> fft;
// VectorXcf X = fft.fwd(x); // 1D C2C or R2C
// VectorXcf y = fft.inv(X); // 1D C2C inverse
// VectorXf r = fft.invReal(X, n); // 1D C2R inverse
// MatrixXcf B = fft.fwd2d(A); // 2D C2C forward
// MatrixXcf C = fft.inv2d(B); // 2D C2C inverse
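//
// Round-trip sketch (illustrative, assuming a CUDA device is available):
//   GpuFFT<float> fft;
//   VectorXcf x = VectorXcf::Random(256);
//   VectorXcf y = fft.inv(fft.fwd(x));  // recovers x up to rounding, due to 1/n scaling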
#ifndef EIGEN_GPU_FFT_H
#define EIGEN_GPU_FFT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuFftSupport.h"
#include "./CuBlasSupport.h"
#include <map>
namespace Eigen {
template <typename Scalar_>
class GpuFFT {
public:
using Scalar = Scalar_;
using Complex = std::complex<Scalar>;
using ComplexVector = Matrix<Complex, Dynamic, 1>;
using RealVector = Matrix<Scalar, Dynamic, 1>;
using ComplexMatrix = Matrix<Complex, Dynamic, Dynamic, ColMajor>;
GpuFFT() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
}
~GpuFFT() {
for (auto& kv : plans_) (void)cufftDestroy(kv.second);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuFFT(const GpuFFT&) = delete;
GpuFFT& operator=(const GpuFFT&) = delete;
// ---- 1D Complex-to-Complex ------------------------------------------------
/** Forward 1D C2C FFT. */
template <typename Derived>
ComplexVector fwd(const MatrixBase<Derived>& x,
typename std::enable_if<NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
const ComplexVector input(x.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);
ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));
ComplexVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
/** Inverse 1D C2C FFT. Scaled by 1/n. */
template <typename Derived>
ComplexVector inv(const MatrixBase<Derived>& X) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv() requires complex input");
const ComplexVector input(X.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);
ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));
// Scale by 1/n.
scale_device(static_cast<Complex*>(d_out_.ptr), n, Scalar(1) / Scalar(n));
ComplexVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- 1D Real-to-Complex ---------------------------------------------------
/** Forward 1D R2C FFT. Returns n/2+1 complex values (half-spectrum). */
template <typename Derived>
ComplexVector fwd(const MatrixBase<Derived>& x,
typename std::enable_if<!NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
const RealVector input(x.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);
const int n_complex = n / 2 + 1;
ensure_buffers(n * sizeof(Scalar), n_complex * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_r2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(
internal::cufftExecR2C_dispatch(plan, static_cast<Scalar*>(d_in_.ptr), static_cast<Complex*>(d_out_.ptr)));
ComplexVector result(n_complex);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n_complex * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- 1D Complex-to-Real ---------------------------------------------------
/** Inverse 1D C2R FFT. Input is n/2+1 complex values, output is nfft real values.
* Scaled by 1/nfft. Caller must specify nfft (the original real signal length):
* it cannot be recovered from the spectrum alone, since both nfft = 2*(L-1) and
* nfft = 2*L-1 produce L = nfft/2 + 1 complex coefficients. */
template <typename Derived>
RealVector invReal(const MatrixBase<Derived>& X, Index nfft) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "invReal() requires complex input");
const ComplexVector input(X.derived());
const int n = static_cast<int>(nfft);
const int n_complex = n / 2 + 1;
eigen_assert(input.size() == n_complex);
if (n == 0) return RealVector(0);
ensure_buffers(n_complex * sizeof(Complex), n * sizeof(Scalar));
// cuFFT C2R may overwrite the input, so we copy to d_in_.
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n_complex * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_c2r_type<Scalar>::value);
EIGEN_CUFFT_CHECK(
internal::cufftExecC2R_dispatch(plan, static_cast<Complex*>(d_in_.ptr), static_cast<Scalar*>(d_out_.ptr)));
// Scale by 1/n.
scale_device_real(static_cast<Scalar*>(d_out_.ptr), n, Scalar(1) / Scalar(n));
RealVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- 2D Complex-to-Complex ------------------------------------------------
/** Forward 2D C2C FFT. Input and output are rows x cols complex matrices. */
template <typename Derived>
ComplexMatrix fwd2d(const MatrixBase<Derived>& A) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "fwd2d() requires complex input");
const ComplexMatrix input(A.derived());
const int rows = static_cast<int>(input.rows());
const int cols = static_cast<int>(input.cols());
if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
ensure_buffers(total, total);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));
ComplexMatrix result(rows, cols);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
/** Inverse 2D C2C FFT. Scaled by 1/(rows*cols). */
template <typename Derived>
ComplexMatrix inv2d(const MatrixBase<Derived>& A) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv2d() requires complex input");
const ComplexMatrix input(A.derived());
const int rows = static_cast<int>(input.rows());
const int cols = static_cast<int>(input.cols());
if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
ensure_buffers(total, total);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));
// Scale by 1/(rows*cols).
const int total_elems = rows * cols;
scale_device(static_cast<Complex*>(d_out_.ptr), total_elems, Scalar(1) / Scalar(total_elems));
ComplexMatrix result(rows, cols);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- Accessors ------------------------------------------------------------
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cublasHandle_t cublas_ = nullptr;
std::map<int64_t, cufftHandle> plans_;
internal::DeviceBuffer d_in_;
internal::DeviceBuffer d_out_;
size_t d_in_size_ = 0;
size_t d_out_size_ = 0;
void ensure_buffers(size_t in_bytes, size_t out_bytes) {
if (in_bytes > d_in_size_) {
if (d_in_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_in_ = internal::DeviceBuffer(in_bytes);
d_in_size_ = in_bytes;
}
if (out_bytes > d_out_size_) {
if (d_out_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_out_ = internal::DeviceBuffer(out_bytes);
d_out_size_ = out_bytes;
}
}
// Plan key encoding: bit 0 = rank, bits 1-8 = type, bits 9+ = dims.
// (cufftType values such as CUFFT_Z2D = 0x6c need 7 bits, so reserve a full byte.)
static int64_t plan_key_1d(int n, cufftType type) { return (int64_t(n) << 9) | (int64_t(type) << 1) | 0; }
static int64_t plan_key_2d(int rows, int cols, cufftType type) {
return (int64_t(rows) << 39) | (int64_t(cols) << 9) | (int64_t(type) << 1) | 1;
}
cufftHandle get_plan_1d(int n, cufftType type) {
int64_t key = plan_key_1d(n, type);
auto it = plans_.find(key);
if (it != plans_.end()) return it->second;
cufftHandle plan;
EIGEN_CUFFT_CHECK(cufftPlan1d(&plan, n, type, /*batch=*/1));
EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
plans_[key] = plan;
return plan;
}
cufftHandle get_plan_2d(int rows, int cols, cufftType type) {
int64_t key = plan_key_2d(rows, cols, type);
auto it = plans_.find(key);
if (it != plans_.end()) return it->second;
// cuFFT uses row-major (C order) for 2D: first dim = rows, second = cols.
// Eigen matrices are column-major, so we pass (cols, rows) to cuFFT
// to get the correct 2D transform.
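// Illustrative: a 64 x 128 ColMajor matrix is 128 contiguous columns of 64
// entries, i.e. a row-major 128 x 64 array, so the call below becomes
// cufftPlan2d(&plan, /*cols=*/128, /*rows=*/64, type).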
cufftHandle plan;
EIGEN_CUFFT_CHECK(cufftPlan2d(&plan, cols, rows, type));
EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
plans_[key] = plan;
return plan;
}
// Scale complex array on device using cuBLAS scal.
void scale_device(Complex* d_ptr, int n, Scalar alpha) { scale_complex(cublas_, d_ptr, n, alpha); }
// Scale real array on device using cuBLAS scal.
void scale_device_real(Scalar* d_ptr, int n, Scalar alpha) { scale_real(cublas_, d_ptr, n, alpha); }
// Type-dispatched cuBLAS scal wrappers (C++14 compatible).
static void scale_complex(cublasHandle_t h, std::complex<float>* p, int n, float a) {
EIGEN_CUBLAS_CHECK(cublasCsscal(h, n, &a, reinterpret_cast<cuComplex*>(p), 1));
}
static void scale_complex(cublasHandle_t h, std::complex<double>* p, int n, double a) {
EIGEN_CUBLAS_CHECK(cublasZdscal(h, n, &a, reinterpret_cast<cuDoubleComplex*>(p), 1));
}
static void scale_real(cublasHandle_t h, float* p, int n, float a) {
EIGEN_CUBLAS_CHECK(cublasSscal(h, n, &a, p, 1));
}
static void scale_real(cublasHandle_t h, double* p, int n, double a) {
EIGEN_CUBLAS_CHECK(cublasDscal(h, n, &a, p, 1));
}
};
} // namespace Eigen
#endif // EIGEN_GPU_FFT_H

View File

@@ -179,7 +179,10 @@ class GpuQR {
/** Solve A * X = B via QR: X = R^{-1} * Q^H * B (least-squares for m >= n).
* Uses ormqr (apply Q^H) + trsm (solve R), without forming Q explicitly.
* Requires m >= n (overdetermined or square). Underdetermined not supported.
*
* TODO: Add device-side accessor for the R factor (and Q application) as
* DeviceMatrix, so users can chain GPU operations without host round-trips. */
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
sync_info();

View File

@@ -143,6 +143,10 @@ class GpuSVD {
Index rows() const { return transposed_ ? n_ : m_; }
Index cols() const { return transposed_ ? m_ : n_; }
// TODO: Add device-side accessors (deviceU(), deviceVT(), deviceSingularValues())
// returning DeviceMatrix views of the internal buffers, so users can chain
// GPU operations without round-tripping through host memory.
/** Singular values (always available). Downloads from device on each call. */
RealVector singularValues() const {
sync_info();

View File

@@ -0,0 +1,321 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix
// multiply (SpMM) via cuSPARSE.
//
// GpuSparseContext manages a cuSPARSE handle and device buffers. It accepts
// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the
// GPU. RowMajor input is implicitly converted to ColMajor.
//
// Usage:
// GpuSparseContext<double> ctx;
// VectorXd y = ctx.multiply(A, x); // y = A * x
// ctx.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y
// VectorXd z = ctx.multiplyT(A, x); // z = A^T * x
// MatrixXd Y = ctx.multiplyMat(A, X); // Y = A * X (multiple RHS)
#ifndef EIGEN_GPU_SPARSE_CONTEXT_H
#define EIGEN_GPU_SPARSE_CONTEXT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSparseSupport.h"
namespace Eigen {
template <typename Scalar_>
class GpuSparseContext {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using StorageIndex = int;
using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
using DenseVector = Matrix<Scalar, Dynamic, 1>;
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
GpuSparseContext() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_));
EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_));
}
~GpuSparseContext() {
destroy_descriptors();
if (handle_) (void)cusparseDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuSparseContext(const GpuSparseContext&) = delete;
GpuSparseContext& operator=(const GpuSparseContext&) = delete;
// ---- SpMV: y = A * x -----------------------------------------------------
/** Compute y = A * x. Returns y as a new dense vector. */
template <typename InputType, typename Rhs>
DenseVector multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const SpMat mat(A.derived());
DenseVector y(mat.rows());
y.setZero();
multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
return y;
}
/** Compute y = alpha * op(A) * x + beta * y (in-place). */
template <typename InputType, typename Rhs, typename Dest>
void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y,
Scalar alpha = Scalar(1), Scalar beta = Scalar(0),
cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
const SpMat mat(A.derived());
multiply_impl(mat, x.derived(), y.derived(), alpha, beta, op);
}
// ---- SpMV transpose: y = A^T * x -----------------------------------------
/** Compute y = A^T * x. Returns y as a new dense vector. */
template <typename InputType, typename Rhs>
DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const SpMat mat(A.derived());
DenseVector y(mat.cols());
y.setZero();
multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE);
return y;
}
// ---- SpMM: Y = A * X (multiple RHS) --------------------------------------
/** Compute Y = A * X where X is a dense matrix (multiple RHS). Returns Y. */
template <typename InputType, typename Rhs>
DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X) {
const SpMat mat(A.derived());
const DenseMatrix rhs(X.derived());
eigen_assert(mat.cols() == rhs.rows());
const Index m = mat.rows();
const Index n = rhs.cols();
if (m == 0 || n == 0 || mat.nonZeros() == 0) return DenseMatrix::Zero(m, n);
DenseMatrix Y = DenseMatrix::Zero(m, n);
spmm_impl(mat, rhs, Y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
return Y;
}
// ---- Accessors ------------------------------------------------------------
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusparseHandle_t handle_ = nullptr;
// Cached device buffers (grow-only).
internal::DeviceBuffer d_outerPtr_;
internal::DeviceBuffer d_innerIdx_;
internal::DeviceBuffer d_values_;
internal::DeviceBuffer d_x_;
internal::DeviceBuffer d_y_;
internal::DeviceBuffer d_workspace_;
size_t d_outerPtr_size_ = 0;
size_t d_innerIdx_size_ = 0;
size_t d_values_size_ = 0;
size_t d_x_size_ = 0;
size_t d_y_size_ = 0;
size_t d_workspace_size_ = 0;
// Cached cuSPARSE descriptors.
cusparseSpMatDescr_t spmat_desc_ = nullptr;
Index cached_rows_ = -1;
Index cached_cols_ = -1;
Index cached_nnz_ = -1;
// ---- SpMV implementation --------------------------------------------------
template <typename RhsDerived, typename DestDerived>
void multiply_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta,
cusparseOperation_t op) {
eigen_assert(A.isCompressed());
const Index m = A.rows();
const Index n = A.cols();
const Index nnz = A.nonZeros();
const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
eigen_assert(x.size() == x_size);
eigen_assert(y.size() == y_size);
if (m == 0 || n == 0 || nnz == 0) {
if (beta == Scalar(0))
y.setZero();
else
y *= beta;
return;
}
// Upload sparse matrix to device.
upload_sparse(A);
// Upload x to device.
ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar));
const DenseVector x_tmp(x);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_.ptr, x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
// Upload y to device (for beta != 0).
ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar));
// Keep y_tmp alive until the stream sync at the end of this function; pageable
// H2D copies are synchronous w.r.t. the host, but we keep the source buffer
// alive for clarity (same convention as in GpuSparseSolverBase::factorize).
DenseVector y_tmp;
if (beta != Scalar(0)) {
y_tmp = y;
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_y_.ptr, y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
}
// Create dense vector descriptors.
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr;
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_.ptr, dtype));
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_.ptr, dtype));
// Query workspace size.
size_t ws_size = 0;
EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
CUSPARSE_SPMV_ALG_DEFAULT, &ws_size));
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
// Execute SpMV.
EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.ptr));
// Download result.
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(y.data(), d_y_.ptr, y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
(void)cusparseDestroyDnVec(x_desc);
(void)cusparseDestroyDnVec(y_desc);
}
// ---- SpMM implementation --------------------------------------------------
void spmm_impl(const SpMat& A, const DenseMatrix& X, DenseMatrix& Y, Scalar alpha, Scalar beta,
cusparseOperation_t op) {
eigen_assert(A.isCompressed());
const Index m = A.rows();
const Index n = X.cols();
const Index k = A.cols();
const Index nnz = A.nonZeros();
if (m == 0 || n == 0 || k == 0 || nnz == 0) {
if (beta == Scalar(0))
Y.setZero();
else
Y *= beta;
return;
}
upload_sparse(A);
// Upload X to device.
const size_t x_bytes = static_cast<size_t>(k) * static_cast<size_t>(n) * sizeof(Scalar);
const size_t y_bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(Scalar);
ensure_buffer(d_x_, d_x_size_, x_bytes);
ensure_buffer(d_y_, d_y_size_, y_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_x_.ptr, X.data(), x_bytes, cudaMemcpyHostToDevice, stream_));
if (beta != Scalar(0)) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.ptr, Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_));
}
// Create dense matrix descriptors.
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr;
// Eigen is column-major, so ld = rows.
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k, n, k, d_x_.ptr, dtype, CUSPARSE_ORDER_COL));
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m, n, m, d_y_.ptr, dtype, CUSPARSE_ORDER_COL));
// Query workspace.
size_t ws_size = 0;
EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_,
x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size));
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
// Execute SpMM.
EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta,
y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.ptr));
// Download result.
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.ptr, y_bytes, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
(void)cusparseDestroyDnMat(x_desc);
(void)cusparseDestroyDnMat(y_desc);
}
// ---- Helpers --------------------------------------------------------------
void upload_sparse(const SpMat& A) {
const Index m = A.rows();
const Index n = A.cols();
const Index nnz = A.nonZeros();
const size_t outer_bytes = static_cast<size_t>(n + 1) * sizeof(StorageIndex);
const size_t inner_bytes = static_cast<size_t>(nnz) * sizeof(StorageIndex);
const size_t val_bytes = static_cast<size_t>(nnz) * sizeof(Scalar);
ensure_buffer(d_outerPtr_, d_outerPtr_size_, outer_bytes);
ensure_buffer(d_innerIdx_, d_innerIdx_size_, inner_bytes);
ensure_buffer(d_values_, d_values_size_, val_bytes);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_outerPtr_.ptr, A.outerIndexPtr(), outer_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_innerIdx_.ptr, A.innerIndexPtr(), inner_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, A.valuePtr(), val_bytes, cudaMemcpyHostToDevice, stream_));
// Recreate descriptor if shape changed.
if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) {
destroy_descriptors();
constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value;
// ColMajor → CSC. outerIndexPtr = col offsets, innerIndexPtr = row indices.
EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr,
idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO, val_type));
cached_rows_ = m;
cached_cols_ = n;
cached_nnz_ = nnz;
} else {
// Same shape — just update pointers.
EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr));
}
}
void destroy_descriptors() {
if (spmat_desc_) {
(void)cusparseDestroySpMat(spmat_desc_);
spmat_desc_ = nullptr;
}
cached_rows_ = -1;
cached_cols_ = -1;
cached_nnz_ = -1;
}
void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) {
if (needed > current_size) {
if (buf.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
buf = internal::DeviceBuffer(needed);
current_size = needed;
}
}
};
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_CONTEXT_H

View File

@@ -0,0 +1,62 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse LDL^T / LDL^H factorization via cuDSS.
//
// For symmetric indefinite (or Hermitian indefinite) sparse matrices.
// Same three-phase workflow as GpuSparseLLT.
//
// Usage:
// GpuSparseLDLT<double> ldlt(A); // analyze + factorize
// VectorXd x = ldlt.solve(b); // solve
#ifndef EIGEN_GPU_SPARSE_LDLT_H
#define EIGEN_GPU_SPARSE_LDLT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSparseSolverBase.h"
namespace Eigen {
/** GPU sparse LDL^T factorization (symmetric indefinite / Hermitian indefinite).
*
* Wraps cuDSS with CUDSS_MTYPE_SYMMETRIC (real) or CUDSS_MTYPE_HERMITIAN (complex).
* Uses pivoting for numerical stability.
*
* \tparam Scalar_ float, double, complex<float>, or complex<double>
* \tparam UpLo_ Lower (default) or Upper — which triangle of A is stored
*/
template <typename Scalar_, int UpLo_ = Lower>
class GpuSparseLDLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT<Scalar_, UpLo_>> {
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT>;
friend Base;
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };
GpuSparseLDLT() = default;
template <typename InputType>
explicit GpuSparseLDLT(const SparseMatrixBase<InputType>& A) {
this->compute(A);
}
static constexpr bool needs_csr_conversion() { return false; }
static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_symmetric_type<Scalar>::value; }
static constexpr cudssMatrixViewType_t cudss_matrix_view() {
return internal::cudss_view_type<UpLo, ColMajor>::value;
}
};
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_LDLT_H

View File

@@ -0,0 +1,62 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse Cholesky (LL^T / LL^H) via cuDSS.
//
// Usage:
// GpuSparseLLT<double> llt(A); // analyze + factorize
// VectorXd x = llt.solve(b); // solve
// llt.analyzePattern(A); // or separate phases
// llt.factorize(A_new); // reuse symbolic analysis
#ifndef EIGEN_GPU_SPARSE_LLT_H
#define EIGEN_GPU_SPARSE_LLT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSparseSolverBase.h"
namespace Eigen {
/** GPU sparse Cholesky factorization (LL^T for real, LL^H for complex).
*
* Wraps cuDSS with CUDSS_MTYPE_SPD (real) or CUDSS_MTYPE_HPD (complex).
* Accepts ColMajor SparseMatrix (CSC), reinterpreted as CSR with swapped
* triangle view for zero-copy upload.
*
* \tparam Scalar_ float, double, complex<float>, or complex<double>
* \tparam UpLo_ Lower (default) or Upper — which triangle of A is stored
*/
template <typename Scalar_, int UpLo_ = Lower>
class GpuSparseLLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT<Scalar_, UpLo_>> {
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT>;
friend Base;
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };
GpuSparseLLT() = default;
template <typename InputType>
explicit GpuSparseLLT(const SparseMatrixBase<InputType>& A) {
this->compute(A);
}
static constexpr bool needs_csr_conversion() { return false; }
static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_spd_type<Scalar>::value; }
static constexpr cudssMatrixViewType_t cudss_matrix_view() {
return internal::cudss_view_type<UpLo, ColMajor>::value;
}
};
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_LLT_H

View File

@@ -0,0 +1,59 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse LU factorization via cuDSS.
//
// For general (non-symmetric) sparse matrices. Uses pivoting.
// Same three-phase workflow as GpuSparseLLT.
//
// Usage:
// GpuSparseLU<double> lu(A); // analyze + factorize
// VectorXd x = lu.solve(b); // solve
#ifndef EIGEN_GPU_SPARSE_LU_H
#define EIGEN_GPU_SPARSE_LU_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSparseSolverBase.h"
namespace Eigen {
/** GPU sparse LU factorization (general matrices).
*
* Wraps cuDSS with CUDSS_MTYPE_GENERAL and CUDSS_MVIEW_FULL.
* Accepts ColMajor SparseMatrix (CSC); internally converts to RowMajor
* CSR since cuDSS requires CSR input.
*
* \tparam Scalar_ float, double, complex<float>, or complex<double>
*/
template <typename Scalar_>
class GpuSparseLU : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLU<Scalar_>> {
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLU>;
friend Base;
public:
using Scalar = Scalar_;
GpuSparseLU() = default;
template <typename InputType>
explicit GpuSparseLU(const SparseMatrixBase<InputType>& A) {
this->compute(A);
}
static constexpr bool needs_csr_conversion() { return true; }
static constexpr cudssMatrixType_t cudss_matrix_type() { return CUDSS_MTYPE_GENERAL; }
static constexpr cudssMatrixViewType_t cudss_matrix_view() { return CUDSS_MVIEW_FULL; }
};
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_LU_H

View File

@@ -0,0 +1,356 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Common base for GPU sparse direct solvers (LLT, LDLT, LU) via cuDSS.
//
// All three solver types share the same three-phase workflow
// (analyzePattern → factorize → solve) and differ only in the
// cudssMatrixType_t and cudssMatrixViewType_t passed to cuDSS.
// This CRTP base implements the entire workflow; derived classes
// provide the matrix type/view via static constexpr members.
#ifndef EIGEN_GPU_SPARSE_SOLVER_BASE_H
#define EIGEN_GPU_SPARSE_SOLVER_BASE_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuDssSupport.h"
namespace Eigen {
namespace internal {
/** CRTP base for GPU sparse direct solvers.
*
* \tparam Scalar_ Element type (passed explicitly to avoid incomplete-type issues with CRTP).
* \tparam Derived The concrete solver class (GpuSparseLLT, GpuSparseLDLT, GpuSparseLU).
* Must provide:
* - `static constexpr bool needs_csr_conversion()`
* - `static constexpr cudssMatrixType_t cudss_matrix_type()`
* - `static constexpr cudssMatrixViewType_t cudss_matrix_view()`
*/
template <typename Scalar_, typename Derived>
class GpuSparseSolverBase {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using StorageIndex = int;
using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
using CsrMat = SparseMatrix<Scalar, RowMajor, StorageIndex>;
using DenseVector = Matrix<Scalar, Dynamic, 1>;
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
GpuSparseSolverBase() { init_context(); }
~GpuSparseSolverBase() {
destroy_cudss_objects();
if (handle_) (void)cudssDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuSparseSolverBase(const GpuSparseSolverBase&) = delete;
GpuSparseSolverBase& operator=(const GpuSparseSolverBase&) = delete;
// ---- Configuration --------------------------------------------------------
/** Set the fill-reducing ordering algorithm. Must be called before compute/analyzePattern. */
void setOrdering(GpuSparseOrdering ordering) { ordering_ = ordering; }
// ---- Factorization --------------------------------------------------------
/** Symbolic analysis + numeric factorization. */
template <typename InputType>
Derived& compute(const SparseMatrixBase<InputType>& A) {
analyzePattern(A);
if (info_ == Success) {
factorize(A);
}
return derived();
}
/** Symbolic analysis only. Uploads sparsity structure to device.
* This phase is synchronous (blocks until complete). */
template <typename InputType>
Derived& analyzePattern(const SparseMatrixBase<InputType>& A) {
const SpMat csc(A.derived());
eigen_assert(csc.rows() == csc.cols() && "GpuSparseSolver requires a square matrix");
eigen_assert(csc.isCompressed() && "GpuSparseSolver requires a compressed sparse matrix");
n_ = csc.rows();
info_ = InvalidInput;
analysis_done_ = false;
if (n_ == 0) {
nnz_ = 0;
info_ = Success;
analysis_done_ = true;
return derived();
}
// For symmetric solvers, ColMajor CSC can be reinterpreted as CSR with
// swapped triangle view (zero copy). For general solvers, we must convert
// to actual RowMajor CSR so cuDSS sees the correct matrix, not A^T.
if (Derived::needs_csr_conversion()) {
const CsrMat csr(csc);
nnz_ = csr.nonZeros();
upload_csr(csr);
} else {
nnz_ = csc.nonZeros();
upload_csr_from_csc(csc);
}
create_cudss_matrix();
apply_ordering_config();
if (data_) EIGEN_CUDSS_CHECK(cudssDataDestroy(handle_, data_));
EIGEN_CUDSS_CHECK(cudssDataCreate(handle_, &data_));
create_placeholder_dense();
EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_ANALYSIS, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
analysis_done_ = true;
info_ = Success;
return derived();
}
/** Numeric factorization using the symbolic analysis from analyzePattern.
*
* \warning The sparsity pattern (outerIndexPtr, innerIndexPtr) must be
* identical to the one passed to analyzePattern(). Only the numerical
* values may change. Passing a different pattern is undefined behavior.
* This matches the contract of CHOLMOD, UMFPACK, and cuDSS's own API.
*
* This phase is asynchronous — info() lazily synchronizes. */
template <typename InputType>
Derived& factorize(const SparseMatrixBase<InputType>& A) {
eigen_assert(analysis_done_ && "factorize() requires analyzePattern() first");
if (n_ == 0) {
info_ = Success;
return derived();
}
// Convert to the same format used in analyzePattern.
// Both temporaries must outlive the async memcpy (pageable H2D is actually
// synchronous w.r.t. the host, but keep them alive for clarity).
const SpMat csc(A.derived());
eigen_assert(csc.rows() == n_ && csc.cols() == n_);
const Scalar* value_ptr;
Index value_nnz;
CsrMat csr_tmp;
if (Derived::needs_csr_conversion()) {
csr_tmp = CsrMat(csc);
value_ptr = csr_tmp.valuePtr();
value_nnz = csr_tmp.nonZeros();
} else {
value_ptr = csc.valuePtr();
value_nnz = csc.nonZeros();
}
eigen_assert(value_nnz == nnz_);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, value_ptr, static_cast<size_t>(nnz_) * sizeof(Scalar),
cudaMemcpyHostToDevice, stream_));
EIGEN_CUDSS_CHECK(cudssMatrixSetValues(d_A_cudss_, d_values_.ptr));
info_ = InvalidInput;
info_synced_ = false;
EIGEN_CUDSS_CHECK(
cudssExecute(handle_, CUDSS_PHASE_FACTORIZATION, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
return derived();
}
// ---- Solve ----------------------------------------------------------------
/** Solve A * X = B. Returns X as a dense matrix.
* Supports single or multiple right-hand sides. */
template <typename Rhs>
DenseMatrix solve(const MatrixBase<Rhs>& B) const {
sync_info();
eigen_assert(info_ == Success && "GpuSparseSolver::solve requires a successful factorization");
eigen_assert(B.rows() == n_);
const DenseMatrix rhs(B);
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
if (n_ == 0) return DenseMatrix(0, rhs.cols());
const size_t rhs_bytes = static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar);
DeviceBuffer d_b(rhs_bytes);
DeviceBuffer d_x(rhs_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_b.ptr, rhs.data(), rhs_bytes, cudaMemcpyHostToDevice, stream_));
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
cudssMatrix_t b_cudss = nullptr, x_cudss = nullptr;
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&b_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_b.ptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&x_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_x.ptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_SOLVE, config_, data_, d_A_cudss_, x_cudss, b_cudss));
DenseMatrix X(n_, rhs.cols());
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_x.ptr, rhs_bytes, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
(void)cudssMatrixDestroy(b_cudss);
(void)cudssMatrixDestroy(x_cudss);
return X;
}
// ---- Accessors ------------------------------------------------------------
ComputationInfo info() const {
sync_info();
return info_;
}
Index rows() const { return n_; }
Index cols() const { return n_; }
cudaStream_t stream() const { return stream_; }
protected:
// ---- CUDA / cuDSS handles -------------------------------------------------
cudaStream_t stream_ = nullptr;
cudssHandle_t handle_ = nullptr;
cudssConfig_t config_ = nullptr;
cudssData_t data_ = nullptr;
cudssMatrix_t d_A_cudss_ = nullptr;
cudssMatrix_t d_x_cudss_ = nullptr;
cudssMatrix_t d_b_cudss_ = nullptr;
// ---- Device buffers for CSR arrays ----------------------------------------
DeviceBuffer d_rowPtr_;
DeviceBuffer d_colIdx_;
DeviceBuffer d_values_;
// ---- State ----------------------------------------------------------------
Index n_ = 0;
Index nnz_ = 0;
ComputationInfo info_ = InvalidInput;
bool info_synced_ = true;
bool analysis_done_ = false;
GpuSparseOrdering ordering_ = GpuSparseOrdering::AMD;
private:
Derived& derived() { return static_cast<Derived&>(*this); }
const Derived& derived() const { return static_cast<const Derived&>(*this); }
void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUDSS_CHECK(cudssCreate(&handle_));
EIGEN_CUDSS_CHECK(cudssSetStream(handle_, stream_));
EIGEN_CUDSS_CHECK(cudssConfigCreate(&config_));
}
void sync_info() const {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
int cudss_info = 0;
EIGEN_CUDSS_CHECK(cudssDataGet(handle_, data_, CUDSS_DATA_INFO, &cudss_info, sizeof(cudss_info), nullptr));
auto* self = const_cast<GpuSparseSolverBase*>(this);
self->info_ = (cudss_info == 0) ? Success : NumericalIssue;
self->info_synced_ = true;
}
}
void destroy_cudss_objects() {
if (d_A_cudss_) {
(void)cudssMatrixDestroy(d_A_cudss_);
d_A_cudss_ = nullptr;
}
if (d_x_cudss_) {
(void)cudssMatrixDestroy(d_x_cudss_);
d_x_cudss_ = nullptr;
}
if (d_b_cudss_) {
(void)cudssMatrixDestroy(d_b_cudss_);
d_b_cudss_ = nullptr;
}
if (data_) {
(void)cudssDataDestroy(handle_, data_);
data_ = nullptr;
}
if (config_) {
(void)cudssConfigDestroy(config_);
config_ = nullptr;
}
}
// Upload CSR from a RowMajor sparse matrix (native CSR).
void upload_csr(const CsrMat& csr) { upload_compressed(csr.outerIndexPtr(), csr.innerIndexPtr(), csr.valuePtr()); }
// Upload CSC arrays reinterpreted as CSR (for symmetric matrices: CSC(A) = CSR(A^T) = CSR(A)).
void upload_csr_from_csc(const SpMat& csc) {
upload_compressed(csc.outerIndexPtr(), csc.innerIndexPtr(), csc.valuePtr());
}
void upload_compressed(const StorageIndex* outer, const StorageIndex* inner, const Scalar* values) {
const size_t rowptr_bytes = static_cast<size_t>(n_ + 1) * sizeof(StorageIndex);
const size_t colidx_bytes = static_cast<size_t>(nnz_) * sizeof(StorageIndex);
const size_t values_bytes = static_cast<size_t>(nnz_) * sizeof(Scalar);
d_rowPtr_ = DeviceBuffer(rowptr_bytes);
d_colIdx_ = DeviceBuffer(colidx_bytes);
d_values_ = DeviceBuffer(values_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_rowPtr_.ptr, outer, rowptr_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_colIdx_.ptr, inner, colidx_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, values, values_bytes, cudaMemcpyHostToDevice, stream_));
}
void create_cudss_matrix() {
if (d_A_cudss_) (void)cudssMatrixDestroy(d_A_cudss_);
constexpr cudaDataType_t idx_type = cudss_index_type<StorageIndex>::value;
constexpr cudaDataType_t val_type = cuda_data_type<Scalar>::value;
constexpr cudssMatrixType_t mtype = Derived::cudss_matrix_type();
constexpr cudssMatrixViewType_t mview = Derived::cudss_matrix_view();
EIGEN_CUDSS_CHECK(cudssMatrixCreateCsr(
&d_A_cudss_, static_cast<int64_t>(n_), static_cast<int64_t>(n_), static_cast<int64_t>(nnz_), d_rowPtr_.ptr,
/*rowEnd=*/nullptr, d_colIdx_.ptr, d_values_.ptr, idx_type, val_type, mtype, mview, CUDSS_BASE_ZERO));
}
void apply_ordering_config() {
cudssAlgType_t alg;
switch (ordering_) {
case GpuSparseOrdering::AMD:
alg = CUDSS_ALG_DEFAULT;
break;
case GpuSparseOrdering::METIS:
alg = CUDSS_ALG_2;
break;
case GpuSparseOrdering::RCM:
alg = CUDSS_ALG_3;
break;
default:
alg = CUDSS_ALG_DEFAULT;
break;
}
EIGEN_CUDSS_CHECK(cudssConfigSet(config_, CUDSS_CONFIG_REORDERING_ALG, &alg, sizeof(alg)));
}
void create_placeholder_dense() {
if (d_x_cudss_) (void)cudssMatrixDestroy(d_x_cudss_);
if (d_b_cudss_) (void)cudssMatrixDestroy(d_b_cudss_);
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_x_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_b_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
}
};
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_SOLVER_BASE_H

View File

@@ -1,8 +1,8 @@
# Eigen GPU Module (`Eigen/GPU`)

GPU-accelerated linear algebra for Eigen users, dispatching to NVIDIA CUDA
libraries (cuBLAS, cuSOLVER, cuFFT, cuSPARSE, cuDSS). Requires CUDA 11.4+;
cuDSS features require CUDA 12.0+ and a separate cuDSS install. Header-only.

## Why this module
@@ -10,25 +10,31 @@ Eigen is the linear algebra foundation for a large ecosystem of C++ projects
in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP,
Open3D), scientific computing (Ceres, Stan), and beyond. Many of these
projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations
without dropping down to raw CUDA library APIs.

GPU sparse solvers are a particularly acute gap. Sparse factorization is the
bottleneck in SLAM, bundle adjustment, FEM, and nonlinear optimization --
exactly the workloads where GPU acceleration matters most. Downstream projects
like [Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
GPU-accelerated sparse solvers, and third-party projects like
[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically because
Eigen lacks them. The `Eigen/GPU` module provides GPU sparse Cholesky, LDL^T,
and LU factorization via cuDSS, alongside dense solvers (cuSOLVER), matrix
products (cuBLAS), FFT (cuFFT), and sparse matrix-vector products (cuSPARSE).

Existing Eigen users should be able to move performance-critical dense or
sparse linear algebra to the GPU with minimal code changes and without
learning CUDA library APIs directly.
## Design philosophy

**CPU and GPU coexist.** There is no global compile-time switch that replaces
CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers
explicitly -- `GpuLLT<double>` vs `LLT<MatrixXd>`, `GpuSparseLLT<double>` vs
`SimplicialLLT<SparseMatrix<double>>` -- and both coexist in the same binary.
This also lets users keep the factored matrix on device across multiple solves,
something impossible with compile-time replacement.
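For instance, a factorization can be reused across many right-hand sides (a
sketch, assuming `GpuLLT` follows the same `compute()`/`solve()` pattern shown
for `GpuLU` and `GpuQR` below):

```cpp
GpuLLT<double> llt;
llt.compute(d_A);             // factor once; the factor stays on device
auto d_X1 = llt.solve(d_B1);  // each solve reuses the device-side factor
auto d_X2 = llt.solve(d_B2);  // no re-upload, no re-factorization
```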
**Familiar syntax.** GPU operations use the same expression patterns as CPU
Eigen. Here is a side-by-side comparison:
@@ -38,6 +44,7 @@ Eigen. Here is a side-by-side comparison:
#include <Eigen/Dense>                 #define EIGEN_USE_GPU
                                       #include <Eigen/GPU>
// Dense
MatrixXd A = ...;                      auto d_A = DeviceMatrix<double>::fromHost(A);
MatrixXd B = ...;                      auto d_B = DeviceMatrix<double>::fromHost(B);
@@ -45,11 +52,15 @@ MatrixXd C = A * B; DeviceMatrix<double> d_C = d_A * d_B;
MatrixXd X = A.llt().solve(B);         DeviceMatrix<double> d_X = d_A.llt().solve(d_B);
                                       MatrixXd X = d_X.toHost();
// Sparse (using SpMat = SparseMatrix<double>)
SimplicialLLT<SpMat> llt(A);           GpuSparseLLT<double> llt(A);
VectorXd x = llt.solve(b);             VectorXd x = llt.solve(b);
``` ```
The GPU version reads like CPU Eigen with explicit upload/download for dense
operations, and an almost identical API for sparse solvers. Unsupported
expressions are compile errors.
**Explicit over implicit.** Host-device transfers, stream management, and
library handle lifetimes are visible in the API. There are no hidden
@@ -162,24 +173,94 @@
```cpp
lu.compute(d_A);
auto d_Y = lu.solve(d_B, GpuLU<double>::Transpose);  // A^T Y = B

// QR solve (overdetermined least squares)
GpuQR<double> qr;
qr.compute(d_A);                         // factorize on device (async)
auto d_X = qr.solve(d_B);                // Q^H * B via ormqr, then trsm on R
MatrixXd X = d_X.toHost();

// SVD (results downloaded on access)
GpuSVD<double> svd;
svd.compute(d_A, ComputeThinU | ComputeThinV);
VectorXd S = svd.singularValues();       // downloads to host
MatrixXd U = svd.matrixU();              // downloads to host
MatrixXd VT = svd.matrixVT();            // V^T (matches cuSOLVER)

// Self-adjoint eigenvalue decomposition (results downloaded on access)
GpuSelfAdjointEigenSolver<double> es;
es.compute(d_A);
VectorXd eigenvals = es.eigenvalues();   // downloads to host
MatrixXd eigenvecs = es.eigenvectors();  // downloads to host
```
The cached API keeps the factored matrix on device, avoiding redundant
host-device transfers and re-factorizations. All solvers also accept host
matrices directly as a convenience (e.g., `GpuLLT<double> llt(A)` or
`qr.solve(B)`), which handles upload/download internally.
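For illustration, the convenience path in full -- upload, factorize, solve,
download -- collapses to two lines:

```cpp
GpuLLT<double> llt(A);      // A is a host MatrixXd: upload + factorize
MatrixXd X = llt.solve(B);  // solve on GPU, download the result (syncs)
```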
### Sparse direct solvers (cuDSS)
Requires cuDSS (separate install, CUDA 12.0+). Define `EIGEN_CUDSS` before
including `Eigen/GPU` and link with `-lcudss`.
```cpp
SparseMatrix<double> A = ...; // symmetric positive definite
VectorXd b = ...;
// Sparse Cholesky -- one-liner
GpuSparseLLT<double> llt(A);
VectorXd x = llt.solve(b);
// Three-phase workflow for repeated solves with the same sparsity pattern
GpuSparseLLT<double> llt;
llt.analyzePattern(A); // symbolic analysis (once)
llt.factorize(A); // numeric factorization
VectorXd x = llt.solve(b);
llt.factorize(A_new_values); // refactorize (reuses symbolic analysis)
VectorXd x2 = llt.solve(b);
// Sparse LDL^T (symmetric indefinite)
GpuSparseLDLT<double> ldlt(A);
VectorXd x = ldlt.solve(b);
// Sparse LU (general non-symmetric)
GpuSparseLU<double> lu(A);
VectorXd x = lu.solve(b);
```
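Multiple right-hand sides work the same way -- `solve()` also accepts a dense
matrix. A minimal sketch, reusing the `llt` object from above:

```cpp
MatrixXd B = MatrixXd::Random(A.rows(), 8);  // 8 right-hand sides
MatrixXd X = llt.solve(B);                   // one factorization, one batched solve
```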
### FFT (cuFFT)
```cpp
GpuFFT<float> fft;
// 1D complex-to-complex
VectorXcf X = fft.fwd(x); // forward
VectorXcf y = fft.inv(X); // inverse (scaled by 1/n)
// 1D real-to-complex / complex-to-real
VectorXcf R = fft.fwd(r); // returns n/2+1 complex (half-spectrum)
VectorXf s = fft.invReal(R, n); // C2R inverse, caller specifies n
// 2D complex-to-complex
MatrixXcf B = fft.fwd2d(A); // 2D forward
MatrixXcf C = fft.inv2d(B); // 2D inverse (scaled by 1/(rows*cols))
// Plans are cached and reused across calls with the same size/type.
```
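As a worked example of the R2C/C2R pair: circular convolution of two
equal-length real signals via the half-spectrum. This is a sketch; the `1/n`
scaling of `invReal` is exactly the inverse-DFT normalization the convolution
theorem requires.

```cpp
const Index n = 1024;
VectorXf x = VectorXf::Random(n), h = VectorXf::Random(n);
GpuFFT<float> fft;
VectorXcf X = fft.fwd(x);        // n/2+1 coefficients
VectorXcf H = fft.fwd(h);
VectorXcf Y = X.cwiseProduct(H); // pointwise product of spectra
VectorXf y = fft.invReal(Y, n);  // y = circular convolution of x and h
```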
### Sparse matrix-vector multiply (cuSPARSE)
```cpp
SparseMatrix<double> A = ...;
VectorXd x = ...;
GpuSparseContext<double> ctx;
VectorXd y = ctx.multiply(A, x); // y = A * x
VectorXd z = ctx.multiplyT(A, x); // z = A^T * x
ctx.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y
// Multiple RHS (SpMM)
MatrixXd Y = ctx.multiplyMat(A, X); // Y = A * X
```
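Each call uploads its operands and downloads the result, so chained products
round-trip through host memory (see Future work below). Simple compositions
are still just two calls, e.g. the normal-equations product:

```cpp
VectorXd t = ctx.multiply(A, x);   // t = A * x
VectorXd z = ctx.multiplyT(A, t);  // z = A^T * (A * x)
```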
### Precision control
@@ -219,7 +300,8 @@ skip the wait (CUDA guarantees in-order execution within a stream).
### Supported scalar types

`float`, `double`, `std::complex<float>`, `std::complex<double>` (unless
noted otherwise).
### Expression -> library call mapping
@@ -241,29 +323,41 @@ skip the wait (CUDA guarantees in-order execution within a stream).
| `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
| `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
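For instance, the last two rows of the mapping correspond to (a sketch, with
`d_A`, `d_B` previously uploaded `DeviceMatrix<double>` operands):

```cpp
DeviceMatrix<double> d_C = d_A.selfadjointView<Lower>() * d_B;  // cublasDsymm
d_C.selfadjointView<Lower>().rankUpdate(d_A);                   // cublasDsyrk
```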
### `DeviceMatrix<Scalar>`

Typed RAII wrapper for a dense column-major matrix in GPU device memory.
Always dense (leading dimension = rows). A vector is a `DeviceMatrix` with
one column.

```cpp
// Construction
DeviceMatrix<Scalar>()                    // Empty (0x0)
DeviceMatrix<Scalar>(rows, cols)          // Allocate uninitialized

// Upload / download
static DeviceMatrix fromHost(matrix, stream=nullptr)               // -> DeviceMatrix (syncs)
static DeviceMatrix fromHostAsync(ptr, rows, cols, outerStride, s) // -> DeviceMatrix (no sync, caller manages ptr lifetime)
PlainMatrix toHost(stream=nullptr)        // -> host Matrix (syncs)
HostTransfer toHostAsync(stream=nullptr)  // -> HostTransfer future (no sync)
DeviceMatrix clone(stream=nullptr)        // -> DeviceMatrix (D2D copy, async)

// Dimensions and access
Index rows()
Index cols()
size_t sizeInBytes()
bool empty()
Scalar* data() // Raw device pointer
void resize(Index rows, Index cols) // Discard contents, reallocate
// Expression builders (return lightweight views, evaluated on assignment)
AdjointView adjoint() // GEMM with ConjTrans
TransposeView transpose() // GEMM with Trans
LltExpr llt() / llt<UpLo>() // -> .solve(d_B) -> DeviceMatrix
LuExpr lu() // -> .solve(d_B) -> DeviceMatrix
TriangularView triangularView<UpLo>() // -> .solve(d_B) -> DeviceMatrix (TRSM)
SelfAdjointView selfadjointView<UpLo>() // -> * d_B (SYMM), .rankUpdate(d_A) (SYRK)
DeviceAssignment device(GpuContext& ctx) // Bind assignment to explicit stream
```
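A short sketch of overlapping device work with an asynchronous download:

```cpp
DeviceMatrix<double> d_C = d_A * d_B;
auto xfer = d_C.toHostAsync();  // enqueue D2H copy, returns a HostTransfer
// ... enqueue further GPU work here ...
MatrixXd C = xfer.get();        // blocks until the copy completes
```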
### `GpuContext`
@@ -280,92 +374,190 @@ cusolverDnHandle_t cusolverHandle()
Non-copyable, non-movable (owns library handles).
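A minimal sketch of routing work through an explicit context via the
`device(ctx)` assignment proxy from the `DeviceMatrix` API (assuming, as with
`GpuSparseContext`, that default construction creates the context's own
stream and handles):

```cpp
GpuContext ctx;               // owns a stream + cuBLAS/cuSOLVER handles
DeviceMatrix<double> d_C;
d_C.device(ctx) = d_A * d_B;  // GEMM enqueued on ctx's stream
```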
### `GpuLLT<Scalar, UpLo>` -- Dense Cholesky (cuSOLVER)

Caches the Cholesky factor on device for repeated solves.

```cpp
GpuLLT()                                     // Default construct, then call compute()
GpuLLT(const EigenBase<D>& A)                // Convenience: upload + factorize

GpuLLT& compute(const EigenBase<D>& A)       // Upload + factorize
GpuLLT& compute(const DeviceMatrix& d_A)     // D2D copy + factorize
GpuLLT& compute(DeviceMatrix&& d_A)          // Adopt + factorize (no copy)

PlainMatrix solve(const MatrixBase<D>& B)    // -> host Matrix (syncs)
DeviceMatrix solve(const DeviceMatrix& d_B)  // -> DeviceMatrix (async, stays on device)

ComputationInfo info()                       // Lazy sync on first call: Success or NumericalIssue
Index rows() / cols()
cudaStream_t stream()
```
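Typical cached use, factoring once and solving repeatedly on device (the
rvalue `compute` overload adopts the buffer and avoids the D2D copy); `d_A`,
`d_B1`, `d_B2` are previously uploaded:

```cpp
GpuLLT<double> llt;
llt.compute(std::move(d_A));                  // adopt device buffer, no copy
DeviceMatrix<double> d_X1 = llt.solve(d_B1);  // async, stays on device
DeviceMatrix<double> d_X2 = llt.solve(d_B2);
if (llt.info() != Success) { /* A was not positive definite */ }
```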
### `GpuLU<Scalar>` -- Dense LU (cuSOLVER)

Same pattern as `GpuLLT`. Adds a `TransposeMode` parameter on `solve()`.

```cpp
PlainMatrix solve(const MatrixBase<D>& B, TransposeMode m = NoTranspose)    // -> host Matrix
DeviceMatrix solve(const DeviceMatrix& d_B, TransposeMode m = NoTranspose)  // -> DeviceMatrix
```
`TransposeMode`: `NoTranspose`, `Transpose`, `ConjugateTranspose`.
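For example:

```cpp
GpuLU<double> lu(A);                                 // upload + factorize
auto d_Y = lu.solve(d_B, GpuLU<double>::Transpose);  // solves A^T Y = B
```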
### `GpuQR<Scalar>` -- Dense QR (cuSOLVER)

QR factorization via `cusolverDnXgeqrf`. Solve uses ORMQR (apply Q^H) + TRSM
(back-substitute on R) -- Q is never formed explicitly.
```cpp
GpuQR()                                      // Default construct
GpuQR(const EigenBase<D>& A)                 // Convenience: upload + factorize

GpuQR& compute(const EigenBase<D>& A)        // Upload + factorize
GpuQR& compute(const DeviceMatrix& d_A)      // D2D copy + factorize

PlainMatrix solve(const MatrixBase<D>& B)    // -> host Matrix (syncs)
DeviceMatrix solve(const DeviceMatrix& d_B)  // -> DeviceMatrix (async)

ComputationInfo info()                       // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
### `GpuSVD<Scalar>` -- Dense SVD (cuSOLVER)

SVD via `cusolverDnXgesvd`. Supports `ComputeThinU | ComputeThinV`,
`ComputeFullU | ComputeFullV`, or `0` (values only). Wide matrices (m < n)
handled by internal transpose.

```cpp
GpuSVD()                                     // Default construct, then call compute()
GpuSVD(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV)  // Convenience
GpuSVD& compute(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV)
GpuSVD& compute(const DeviceMatrix& d_A, unsigned options = ComputeThinU | ComputeThinV)
RealVector singularValues() // -> host vector (syncs, downloads)
PlainMatrix matrixU() // -> host Matrix (syncs, downloads)
PlainMatrix matrixVT() // -> host Matrix (syncs, downloads V^T)
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (pseudoinverse)
PlainMatrix solve(const MatrixBase<D>& B, Index k) // Truncated (top k triplets)
PlainMatrix solve(const MatrixBase<D>& B, RealScalar l) // Tikhonov regularized
Index rank(RealScalar threshold = -1)
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
**Note:** `singularValues()`, `matrixU()`, and `matrixVT()` download to host
on each call. Device-side accessors returning `DeviceMatrix` are planned but
not yet implemented.
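For illustration, the solve variants side by side (the type of the second
argument selects the overload); `A` and `B` are host matrices:

```cpp
GpuSVD<double> svd(A);                  // thin U/V by default
MatrixXd X0 = svd.solve(B);             // pseudoinverse solve
MatrixXd X1 = svd.solve(B, Index(20));  // truncated: top 20 singular triplets
MatrixXd X2 = svd.solve(B, 1e-3);       // Tikhonov-regularized
Index r = svd.rank(1e-10);              // singular values above threshold
```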
### `GpuSelfAdjointEigenSolver<Scalar>` -- Eigendecomposition (cuSOLVER)
Symmetric/Hermitian eigenvalue decomposition via `cusolverDnXsyevd`.
`ComputeMode` enum: `EigenvaluesOnly`, `ComputeEigenvectors`.
```cpp
GpuSelfAdjointEigenSolver() // Default construct, then call compute()
GpuSelfAdjointEigenSolver(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors) // Convenience
GpuSelfAdjointEigenSolver& compute(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors)
GpuSelfAdjointEigenSolver& compute(const DeviceMatrix& d_A, ComputeMode mode = ComputeEigenvectors)
RealVector eigenvalues() // -> host vector (syncs, downloads, ascending order)
PlainMatrix eigenvectors() // -> host Matrix (syncs, downloads, columns)
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
**Note:** `eigenvalues()` and `eigenvectors()` download to host on each call.
Device-side accessors returning `DeviceMatrix` are planned but not yet
implemented.
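For instance, estimating the spectral condition number of an SPD host matrix
`A` from the (ascending) eigenvalues, skipping the eigenvector computation:

```cpp
using ES = GpuSelfAdjointEigenSolver<double>;
ES es(A, ES::EigenvaluesOnly);
VectorXd w = es.eigenvalues();         // ascending order
double cond = w(w.size() - 1) / w(0);  // valid when A is positive definite
```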
### `HostTransfer<Scalar>`
Future for async device-to-host transfer. Returned by
`DeviceMatrix::toHostAsync()`.
```cpp
PlainMatrix& get() // Block until complete, return host Matrix ref. Idempotent.
bool ready() // Non-blocking poll
```
### `GpuSparseLLT<Scalar, UpLo>` -- Sparse Cholesky (cuDSS)
Requires cuDSS (CUDA 12.0+, `#define EIGEN_CUDSS`). Three-phase workflow
with symbolic reuse. Accepts `SparseMatrix<Scalar, ColMajor, int>` (CSC).
```cpp
GpuSparseLLT() // Default construct
GpuSparseLLT(const SparseMatrixBase<D>& A) // Analyze + factorize
GpuSparseLLT& analyzePattern(const SparseMatrixBase<D>& A) // Symbolic analysis (reusable)
GpuSparseLLT& factorize(const SparseMatrixBase<D>& A) // Numeric factorization
GpuSparseLLT& compute(const SparseMatrixBase<D>& A) // analyzePattern + factorize
void setOrdering(GpuSparseOrdering ord) // AMD (default), METIS, or RCM
DenseMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
### `GpuSparseLDLT<Scalar, UpLo>` -- Sparse LDL^T (cuDSS)
Symmetric indefinite. Same API as `GpuSparseLLT`.
### `GpuSparseLU<Scalar>` -- Sparse LU (cuDSS)
General non-symmetric. Same API as `GpuSparseLLT` (without `UpLo`).
### `GpuFFT<Scalar>` -- FFT (cuFFT)
Plans cached by (size, type) and reused. Inverse transforms scaled so
`inv(fwd(x)) == x`. Supported scalars: `float`, `double`.
```cpp
// 1D transforms (host vectors in and out)
ComplexVector fwd(const MatrixBase<D>& x) // C2C forward (complex input)
ComplexVector fwd(const MatrixBase<D>& x) // R2C forward (real input, returns n/2+1)
ComplexVector inv(const MatrixBase<D>& X) // C2C inverse, scaled by 1/n
RealVector invReal(const MatrixBase<D>& X, Index n) // C2R inverse, scaled by 1/n
// 2D transforms (host matrices in and out)
ComplexMatrix fwd2d(const MatrixBase<D>& A) // 2D C2C forward
ComplexMatrix inv2d(const MatrixBase<D>& A) // 2D C2C inverse, scaled by 1/(rows*cols)
cudaStream_t stream()
```
All FFT methods accept host data and return host data. Upload/download is
handled internally. The C2C and R2C overloads of `fwd()` are distinguished by
the input scalar type (complex vs real).
### `GpuSparseContext<Scalar>` -- SpMV/SpMM (cuSPARSE)
Accepts `SparseMatrix<Scalar, ColMajor>`. All methods accept host data and
return host data.
```cpp
GpuSparseContext() // Creates own stream + cuSPARSE handle
DenseVector multiply(A, x) // y = A * x
void multiply(A, x, y, alpha=1, beta=0, // y = alpha*op(A)*x + beta*y
op=CUSPARSE_OPERATION_NON_TRANSPOSE)
DenseVector multiplyT(A, x) // y = A^T * x
DenseMatrix multiplyMat(A, X) // Y = A * X (SpMM)
cudaStream_t stream()
```
### Aliasing
@@ -393,6 +585,15 @@ The caller must ensure operands don't alias the destination for GEMM and TRSM
| `GpuQR.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense QR decomposition |
| `GpuSVD.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense SVD decomposition |
| `GpuEigenSolver.h` | `CuSolverSupport.h` | Self-adjoint eigenvalue decomposition |
| `CuFftSupport.h` | `GpuSupport.h`, `<cufft.h>` | cuFFT error macro, type-dispatch wrappers |
| `GpuFFT.h` | `CuFftSupport.h`, `CuBlasSupport.h` | 1D/2D FFT with plan caching |
| `CuSparseSupport.h` | `GpuSupport.h`, `<cusparse.h>` | cuSPARSE error macro |
| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE |
| `CuDssSupport.h` | `GpuSupport.h`, `<cudss.h>` | cuDSS error macro, type traits (optional) |
| `GpuSparseSolverBase.h` | `CuDssSupport.h` | CRTP base for sparse solvers (optional) |
| `GpuSparseLLT.h` | `GpuSparseSolverBase.h` | Sparse Cholesky via cuDSS (optional) |
| `GpuSparseLDLT.h` | `GpuSparseSolverBase.h` | Sparse LDL^T via cuDSS (optional) |
| `GpuSparseLU.h` | `GpuSparseSolverBase.h` | Sparse LU via cuDSS (optional) |
## Building and testing
@@ -404,6 +605,32 @@ cmake -G Ninja -B build -S . \
  -DEIGEN_TEST_CUSOLVER=ON
cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu \
  gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen \
  gpu_device_matrix gpu_cufft gpu_cusparse_spmv
ctest --test-dir build -R "gpu_" --output-on-failure
# Sparse solvers (cuDSS -- separate install required)
cmake -G Ninja -B build -S . \
-DEIGEN_TEST_CUDA=ON \
-DEIGEN_CUDA_COMPUTE_ARCH="70" \
-DEIGEN_TEST_CUDSS=ON
cmake --build build --target gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu
ctest --test-dir build -R gpu_cudss --output-on-failure
```
## Future work
- **Device-side accessors for decomposition results.** `GpuSVD`,
`GpuSelfAdjointEigenSolver`, and `GpuQR` currently download decomposition
results to host on access (e.g., `svd.matrixU()` returns a host `MatrixXd`).
Device-side accessors returning `DeviceMatrix` views of the internal buffers
would allow chaining GPU operations (e.g., `svd.deviceU() * d_A`) without
round-tripping through host memory.
- **Device-resident sparse matrix-vector products.** `GpuSparseContext`
currently operates on host vectors and matrices, uploading and downloading
on each call. The key missing piece is a `DeviceSparseView` that holds a
sparse matrix on device and supports operator syntax (`d_y = d_A * d_x`)
with `DeviceMatrix` operands -- keeping the entire SpMV/SpMM pipeline on
device. This is essential for iterative solvers and any workflow that chains
sparse and dense operations without returning to the host.

View File
@@ -51,3 +51,7 @@ eigen_add_gpu_benchmark(bench_gpu_chaining_float bench_gpu_chaining.cpp DEFINITI
# Batching benchmarks: multi-stream concurrency for many small systems.
eigen_add_gpu_benchmark(bench_gpu_batching bench_gpu_batching.cpp)
eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITIONS SCALAR=float)
# FFT benchmarks: 1D/2D C2C, R2C, C2R throughput and plan reuse.
eigen_add_gpu_benchmark(bench_gpu_fft bench_gpu_fft.cpp LIBRARIES CUDA::cufft)
eigen_add_gpu_benchmark(bench_gpu_fft_double bench_gpu_fft.cpp LIBRARIES CUDA::cufft DEFINITIONS SCALAR=double)

View File
@@ -0,0 +1,185 @@
// GPU FFT benchmarks: GpuFFT 1D and 2D throughput.
//
// Measures forward and inverse FFT performance across a range of sizes,
// including plan-amortized (reuse) and cold-start (new plan) scenarios.
//
// Usage:
// cmake --build build-bench-gpu --target bench_gpu_fft
// ./build-bench-gpu/bench_gpu_fft
//
// Profiling:
// nsys profile --trace=cuda ./build-bench-gpu/bench_gpu_fft
#include <benchmark/benchmark.h>
#include <Eigen/GPU>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
using Scalar = SCALAR;
using Complex = std::complex<Scalar>;
using CVec = Matrix<Complex, Dynamic, 1>;
using RVec = Matrix<Scalar, Dynamic, 1>;
using CMat = Matrix<Complex, Dynamic, Dynamic>;
// CUDA warm-up: ensure the GPU is initialized before timing.
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
// --------------------------------------------------------------------------
// 1D C2C Forward
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_C2C_Fwd(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CVec x = CVec::Random(n);
GpuFFT<Scalar> fft;
// Warm up plan.
CVec tmp = fft.fwd(x);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.fwd(x));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2); // read + write
}
BENCHMARK(BM_GpuFFT_1D_C2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 1D C2C Inverse
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_C2C_Inv(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CVec x = CVec::Random(n);
GpuFFT<Scalar> fft;
CVec X = fft.fwd(x);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.inv(X));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2);
}
BENCHMARK(BM_GpuFFT_1D_C2C_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 1D R2C Forward
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_R2C_Fwd(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
RVec r = RVec::Random(n);
GpuFFT<Scalar> fft;
// Warm up plan.
CVec tmp = fft.fwd(r);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.fwd(r));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * (n * sizeof(Scalar) + (n / 2 + 1) * sizeof(Complex)));
}
BENCHMARK(BM_GpuFFT_1D_R2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 1D C2R Inverse
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_C2R_Inv(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
RVec r = RVec::Random(n);
GpuFFT<Scalar> fft;
CVec R = fft.fwd(r);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.invReal(R, n));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * ((n / 2 + 1) * sizeof(Complex) + n * sizeof(Scalar)));
}
BENCHMARK(BM_GpuFFT_1D_C2R_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 2D C2C Forward
// --------------------------------------------------------------------------
static void BM_GpuFFT_2D_C2C_Fwd(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0); // square n x n
CMat A = CMat::Random(n, n);
GpuFFT<Scalar> fft;
// Warm up plan.
CMat tmp = fft.fwd2d(A);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.fwd2d(A));
}
state.SetItemsProcessed(state.iterations() * n * n);
state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 2);
}
BENCHMARK(BM_GpuFFT_2D_C2C_Fwd)->RangeMultiplier(2)->Range(64, 4096);
// --------------------------------------------------------------------------
// 2D C2C Roundtrip (fwd + inv)
// --------------------------------------------------------------------------
static void BM_GpuFFT_2D_C2C_Roundtrip(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CMat A = CMat::Random(n, n);
GpuFFT<Scalar> fft;
// Warm up plans.
CMat tmp = fft.inv2d(fft.fwd2d(A));
for (auto _ : state) {
CMat B = fft.fwd2d(A);
benchmark::DoNotOptimize(fft.inv2d(B));
}
state.SetItemsProcessed(state.iterations() * n * n * 2); // fwd + inv
state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 4);
}
BENCHMARK(BM_GpuFFT_2D_C2C_Roundtrip)->RangeMultiplier(2)->Range(64, 4096);
// --------------------------------------------------------------------------
// 1D Cold start (includes plan creation)
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_ColdStart(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CVec x = CVec::Random(n);
for (auto _ : state) {
GpuFFT<Scalar> fft; // new object = new plans
benchmark::DoNotOptimize(fft.fwd(x));
}
state.SetItemsProcessed(state.iterations() * n);
}
BENCHMARK(BM_GpuFFT_1D_ColdStart)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);

View File
@@ -547,11 +547,86 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif()
# cuFFT test (cuFFT is part of the CUDA toolkit -- no separate option needed).
if(TARGET CUDA::cufft)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
add_executable(gpu_cufft gpu_cufft.cpp)
target_include_directories(gpu_cufft PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_cufft
Eigen3::Eigen CUDA::cudart CUDA::cufft CUDA::cublas)
target_compile_definitions(gpu_cufft PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
add_test(NAME gpu_cufft COMMAND gpu_cufft)
add_dependencies(buildtests gpu_cufft)
add_dependencies(buildtests_gpu gpu_cufft)
set_property(TEST gpu_cufft APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST gpu_cufft PROPERTY SKIP_RETURN_CODE 77)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif()
# cuSPARSE SpMV test (cuSPARSE is part of the CUDA toolkit).
if(TARGET CUDA::cusparse)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
add_executable(gpu_cusparse_spmv gpu_cusparse_spmv.cpp)
target_include_directories(gpu_cusparse_spmv PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_cusparse_spmv
Eigen3::Eigen CUDA::cudart CUDA::cusparse)
target_compile_definitions(gpu_cusparse_spmv PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
add_test(NAME gpu_cusparse_spmv COMMAND gpu_cusparse_spmv)
add_dependencies(buildtests gpu_cusparse_spmv)
add_dependencies(buildtests_gpu gpu_cusparse_spmv)
set_property(TEST gpu_cusparse_spmv APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST gpu_cusparse_spmv PROPERTY SKIP_RETURN_CODE 77)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif()
option(EIGEN_TEST_CUSPARSE "Test cuSPARSE integration" OFF)
if(EIGEN_TEST_CUSPARSE AND TARGET CUDA::cusparse)
  ei_add_test(gpu_cusparse "" "CUDA::cusparse")
endif()
# cuDSS sparse direct solver tests.
# cuDSS is distributed separately from the CUDA Toolkit.
option(EIGEN_TEST_CUDSS "Test cuDSS sparse solver integration" OFF)
if(EIGEN_TEST_CUDSS)
find_path(CUDSS_INCLUDE_DIR cudss.h
HINTS ${CUDSS_DIR}/include ${CUDA_TOOLKIT_ROOT_DIR}/include /usr/include)
find_library(CUDSS_LIBRARY cudss
HINTS ${CUDSS_DIR}/lib ${CUDSS_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib/x86_64-linux-gnu)
if(CUDSS_INCLUDE_DIR AND CUDSS_LIBRARY)
message(STATUS "cuDSS found: ${CUDSS_LIBRARY}")
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
foreach(_cudss_test IN ITEMS gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu)
add_executable(${_cudss_test} ${_cudss_test}.cpp)
target_include_directories(${_cudss_test} PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CUDSS_INCLUDE_DIR}"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(${_cudss_test}
Eigen3::Eigen CUDA::cudart CUDA::cusolver CUDA::cublas ${CUDSS_LIBRARY})
target_compile_definitions(${_cudss_test} PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1
EIGEN_CUDSS=1)
add_test(NAME ${_cudss_test} COMMAND "${_cudss_test}")
add_dependencies(buildtests ${_cudss_test})
add_dependencies(buildtests_gpu ${_cudss_test})
set_property(TEST ${_cudss_test} APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST ${_cudss_test} PROPERTY SKIP_RETURN_CODE 77)
endforeach()
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
else()
message(WARNING "EIGEN_TEST_CUDSS=ON but cuDSS not found. Set CUDSS_DIR.")
endif()
endif()
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()
test/gpu_cudss_ldlt.cpp (new file, 154 lines)
@@ -0,0 +1,154 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseLDLT: GPU sparse LDL^T via cuDSS.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse symmetric indefinite matrix ---------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_symmetric_indefinite(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
// Build a random sparse matrix and symmetrize it.
// The diagonal has mixed signs to ensure indefiniteness.
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
R.makeCompressed();
// A = R + R^H (symmetric), then add diagonal with alternating signs for indefiniteness.
SpMat A = R + SparseMatrix<Scalar, ColMajor, int>(R.adjoint());
for (Index i = 0; i < n; ++i) {
Scalar diag_val = Scalar((i % 2 == 0) ? n : -n);
A.coeffRef(i, i) += diag_val;
}
A.makeCompressed();
return A;
}
// ---- Solve and check residual -----------------------------------------------
template <typename Scalar>
void test_solve(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_symmetric_indefinite<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLDLT<Scalar> ldlt(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Vec x = ldlt.solve(b);
VERIFY_IS_EQUAL(x.rows(), n);
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Multiple RHS -----------------------------------------------------------
template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_symmetric_indefinite<Scalar>(n);
Mat B = Mat::Random(n, nrhs);
GpuSparseLDLT<Scalar> ldlt(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Mat X = ldlt.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
Mat R = A * X - B;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(R.norm() / B.norm() < tol);
}
// ---- Refactorize ------------------------------------------------------------
template <typename Scalar>
void test_refactorize(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_symmetric_indefinite<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLDLT<Scalar> ldlt;
ldlt.analyzePattern(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
ldlt.factorize(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Vec x1 = ldlt.solve(b);
// Modify values, keep pattern.
SpMat A2 = A;
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
ldlt.factorize(A2);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Vec x2 = ldlt.solve(b);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((A * x1 - b).norm() / b.norm() < tol);
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}
// ---- Empty ------------------------------------------------------------------
void test_empty() {
using SpMat = SparseMatrix<double, ColMajor, int>;
SpMat A(0, 0);
A.makeCompressed();
GpuSparseLDLT<double> ldlt(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
VERIFY_IS_EQUAL(ldlt.rows(), 0);
VERIFY_IS_EQUAL(ldlt.cols(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_solve<Scalar>(64));
CALL_SUBTEST(test_solve<Scalar>(256));
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
CALL_SUBTEST(test_refactorize<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cudss_ldlt) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_empty());
}
test/gpu_cudss_llt.cpp (new file, 202 lines)
@@ -0,0 +1,202 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseLLT: GPU sparse Cholesky via cuDSS.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse SPD matrix -------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
// Uses the global std::rand state seeded by the test framework (g_seed).
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
R.makeCompressed();
// A = R^H * R + n * I (guaranteed SPD).
SpMat A = R.adjoint() * R;
for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
A.makeCompressed();
return A;
}
// ---- Solve and check residual -----------------------------------------------
template <typename Scalar>
void test_solve(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x = llt.solve(b);
VERIFY_IS_EQUAL(x.rows(), n);
// Check residual: ||Ax - b|| / ||b||.
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Compare with CPU SimplicialLLT -----------------------------------------
template <typename Scalar>
void test_vs_cpu(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar> gpu_llt(A);
VERIFY_IS_EQUAL(gpu_llt.info(), Success);
Vec x_gpu = gpu_llt.solve(b);
SimplicialLLT<SpMat> cpu_llt(A);
VERIFY_IS_EQUAL(cpu_llt.info(), Success);
Vec x_cpu = cpu_llt.solve(b);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((x_gpu - x_cpu).norm() / x_cpu.norm() < tol);
}
// ---- Multiple RHS -----------------------------------------------------------
template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Mat B = Mat::Random(n, nrhs);
GpuSparseLLT<Scalar> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Mat X = llt.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
Mat R = A * X - B;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(R.norm() / B.norm() < tol);
}
// ---- Separate analyze + factorize (refactorization) -------------------------
template <typename Scalar>
void test_refactorize(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar> llt;
llt.analyzePattern(A);
VERIFY_IS_EQUAL(llt.info(), Success);
// First factorize + solve.
llt.factorize(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x1 = llt.solve(b);
// Modify values (keep same pattern): scale diagonal.
SpMat A2 = A;
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
// Refactorize with same pattern.
llt.factorize(A2);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x2 = llt.solve(b);
// Both solutions should satisfy their respective systems.
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((A * x1 - b).norm() / b.norm() < tol);
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
// Solutions should differ (A2 != A).
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}
// ---- Empty matrix -----------------------------------------------------------
void test_empty() {
using SpMat = SparseMatrix<double, ColMajor, int>;
SpMat A(0, 0);
A.makeCompressed();
GpuSparseLLT<double> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
VERIFY_IS_EQUAL(llt.rows(), 0);
VERIFY_IS_EQUAL(llt.cols(), 0);
}
// ---- Upper triangle ---------------------------------------------------------
template <typename Scalar>
void test_upper(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar, Upper> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x = llt.solve(b);
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_solve<Scalar>(64));
CALL_SUBTEST(test_solve<Scalar>(256));
CALL_SUBTEST(test_vs_cpu<Scalar>(64));
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
CALL_SUBTEST(test_refactorize<Scalar>(64));
CALL_SUBTEST(test_upper<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cudss_llt) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_empty());
}
test/gpu_cudss_lu.cpp (new file, 147 lines)
@@ -0,0 +1,147 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseLU: GPU sparse LU via cuDSS.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse non-singular general matrix ---------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_general(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
// Add strong diagonal for non-singularity.
for (Index i = 0; i < n; ++i) R.coeffRef(i, i) += Scalar(RealScalar(n));
R.makeCompressed();
return R;
}
// ---- Solve and check residual -----------------------------------------------
template <typename Scalar>
void test_solve(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_general<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLU<Scalar> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
Vec x = lu.solve(b);
VERIFY_IS_EQUAL(x.rows(), n);
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Multiple RHS -----------------------------------------------------------
template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_general<Scalar>(n);
Mat B = Mat::Random(n, nrhs);
GpuSparseLU<Scalar> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
Mat X = lu.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
Mat R = A * X - B;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(R.norm() / B.norm() < tol);
}
// ---- Refactorize ------------------------------------------------------------
template <typename Scalar>
void test_refactorize(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_general<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLU<Scalar> lu;
lu.analyzePattern(A);
VERIFY_IS_EQUAL(lu.info(), Success);
lu.factorize(A);
VERIFY_IS_EQUAL(lu.info(), Success);
Vec x1 = lu.solve(b);
// Modify values, keep pattern.
SpMat A2 = A;
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
lu.factorize(A2);
VERIFY_IS_EQUAL(lu.info(), Success);
Vec x2 = lu.solve(b);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((A * x1 - b).norm() / b.norm() < tol);
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}
// ---- Empty ------------------------------------------------------------------
void test_empty() {
using SpMat = SparseMatrix<double, ColMajor, int>;
SpMat A(0, 0);
A.makeCompressed();
GpuSparseLU<double> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
VERIFY_IS_EQUAL(lu.rows(), 0);
VERIFY_IS_EQUAL(lu.cols(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_solve<Scalar>(64));
CALL_SUBTEST(test_solve<Scalar>(256));
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
CALL_SUBTEST(test_refactorize<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cudss_lu) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_empty());
}
test/gpu_cufft.cpp (new file, 186 lines)
@@ -0,0 +1,186 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuFFT: GPU FFT via cuFFT.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/GPU>
using namespace Eigen;
// ---- 1D C2C roundtrip: inv(fwd(x)) ≈ x -------------------------------------
template <typename Scalar>
void test_c2c_roundtrip(Index n) {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
using RealScalar = Scalar;
Vec x = Vec::Random(n);
GpuFFT<Scalar> fft;
Vec X = fft.fwd(x);
VERIFY_IS_EQUAL(X.size(), n);
Vec y = fft.inv(X);
VERIFY_IS_EQUAL(y.size(), n);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y - x).norm() / x.norm() < tol);
}
// ---- 1D C2C known signal: FFT of constant = delta --------------------------
template <typename Scalar>
void test_c2c_constant() {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
using RealScalar = Scalar;
const int n = 64;
Vec x = Vec::Constant(n, Complex(3.0, 0.0));
GpuFFT<Scalar> fft;
Vec X = fft.fwd(x);
// FFT of constant c: X[0] = c*n, X[k] = 0 for k > 0.
RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(n);
VERIFY(std::abs(X(0) - Complex(3.0 * n, 0.0)) < tol);
for (int k = 1; k < n; ++k) {
VERIFY(std::abs(X(k)) < tol);
}
}
// ---- 1D R2C/C2R roundtrip: invReal(fwd(r), n) ≈ r --------------------------
template <typename Scalar>
void test_r2c_roundtrip(Index n) {
using Complex = std::complex<Scalar>;
using CVec = Matrix<Complex, Dynamic, 1>;
using RVec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = Scalar;
RVec r = RVec::Random(n);
GpuFFT<Scalar> fft;
CVec R = fft.fwd(r);
// R2C returns n/2+1 complex values.
VERIFY_IS_EQUAL(R.size(), n / 2 + 1);
RVec s = fft.invReal(R, n);
VERIFY_IS_EQUAL(s.size(), n);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((s - r).norm() / r.norm() < tol);
}
// ---- 2D C2C roundtrip: inv2d(fwd2d(A)) ≈ A ---------------------------------
template <typename Scalar>
void test_2d_roundtrip(Index rows, Index cols) {
using Complex = std::complex<Scalar>;
using Mat = Matrix<Complex, Dynamic, Dynamic>;
using RealScalar = Scalar;
Mat A = Mat::Random(rows, cols);
GpuFFT<Scalar> fft;
Mat B = fft.fwd2d(A);
VERIFY_IS_EQUAL(B.rows(), rows);
VERIFY_IS_EQUAL(B.cols(), cols);
Mat C = fft.inv2d(B);
VERIFY_IS_EQUAL(C.rows(), rows);
VERIFY_IS_EQUAL(C.cols(), cols);
RealScalar tol = RealScalar(10) * RealScalar(rows * cols) * NumTraits<Scalar>::epsilon();
VERIFY((C - A).norm() / A.norm() < tol);
}
// ---- 2D C2C known signal: constant matrix -----------------------------------
template <typename Scalar>
void test_2d_constant() {
using Complex = std::complex<Scalar>;
using Mat = Matrix<Complex, Dynamic, Dynamic>;
using RealScalar = Scalar;
const int rows = 16, cols = 32;
Mat A = Mat::Constant(rows, cols, Complex(2.0, 0.0));
GpuFFT<Scalar> fft;
Mat B = fft.fwd2d(A);
// 2D FFT of constant c: B(0,0) = c*rows*cols, all others = 0.
RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(rows * cols);
VERIFY(std::abs(B(0, 0) - Complex(2.0 * rows * cols, 0.0)) < tol);
for (int j = 0; j < cols; ++j) {
for (int i = 0; i < rows; ++i) {
if (i == 0 && j == 0) continue;
VERIFY(std::abs(B(i, j)) < tol);
}
}
}
// ---- Plan reuse: repeated calls should work ---------------------------------
template <typename Scalar>
void test_plan_reuse() {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
using RealScalar = Scalar;
GpuFFT<Scalar> fft;
for (int trial = 0; trial < 5; ++trial) {
Vec x = Vec::Random(128);
Vec X = fft.fwd(x);
Vec y = fft.inv(X);
RealScalar tol = RealScalar(10) * RealScalar(128) * NumTraits<Scalar>::epsilon();
VERIFY((y - x).norm() / x.norm() < tol);
}
}
// ---- Empty ------------------------------------------------------------------
template <typename Scalar>
void test_empty() {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
GpuFFT<Scalar> fft;
Vec x(0);
Vec X = fft.fwd(x);
VERIFY_IS_EQUAL(X.size(), 0);
Vec y = fft.inv(X);
VERIFY_IS_EQUAL(y.size(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(64));
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(256));
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(1000)); // non-power-of-2
CALL_SUBTEST(test_c2c_constant<Scalar>());
CALL_SUBTEST(test_r2c_roundtrip<Scalar>(64));
CALL_SUBTEST(test_r2c_roundtrip<Scalar>(256));
CALL_SUBTEST(test_2d_roundtrip<Scalar>(32, 32));
CALL_SUBTEST(test_2d_roundtrip<Scalar>(16, 64)); // non-square
CALL_SUBTEST(test_2d_constant<Scalar>());
CALL_SUBTEST(test_plan_reuse<Scalar>());
CALL_SUBTEST(test_empty<Scalar>());
}
EIGEN_DECLARE_TEST(gpu_cufft) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
}
test/gpu_cusparse_spmv.cpp (new file, 203 lines)
@@ -0,0 +1,203 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseContext: GPU SpMV/SpMM via cuSPARSE.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse matrix -----------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_sparse(Index rows, Index cols, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat R(rows, cols);
R.reserve(VectorXi::Constant(cols, static_cast<int>(rows * density) + 1));
for (Index j = 0; j < cols; ++j) {
for (Index i = 0; i < rows; ++i) {
if ((std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(RealScalar(std::rand() / double(RAND_MAX) - 0.5));
}
}
}
R.makeCompressed();
return R;
}
// ---- SpMV: y = A * x -------------------------------------------------------
template <typename Scalar>
void test_spmv(Index rows, Index cols) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(rows, cols);
Vec x = Vec::Random(cols);
GpuSparseContext<Scalar> ctx;
Vec y_gpu = ctx.multiply(A, x);
Vec y_cpu = A * x;
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
VERIFY_IS_EQUAL(y_gpu.size(), rows);
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- SpMV with alpha/beta: y = alpha*A*x + beta*y ---------------------------
template <typename Scalar>
void test_spmv_alpha_beta(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(n, n);
Vec x = Vec::Random(n);
Vec y_init = Vec::Random(n);
Scalar alpha(2);
Scalar beta(3);
Vec y_cpu = alpha * (A * x) + beta * y_init;
GpuSparseContext<Scalar> ctx;
Vec y_gpu = y_init;
ctx.multiply(A, x, y_gpu, alpha, beta);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- Transpose: y = A^T * x ------------------------------------------------
template <typename Scalar>
void test_spmv_transpose(Index rows, Index cols) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(rows, cols);
Vec x = Vec::Random(rows);
GpuSparseContext<Scalar> ctx;
Vec y_gpu = ctx.multiplyT(A, x);
Vec y_cpu = A.transpose() * x;
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
VERIFY_IS_EQUAL(y_gpu.size(), cols);
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- SpMM: Y = A * X (multiple RHS) ----------------------------------------
template <typename Scalar>
void test_spmm(Index rows, Index cols, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(rows, cols);
Mat X = Mat::Random(cols, nrhs);
GpuSparseContext<Scalar> ctx;
Mat Y_gpu = ctx.multiplyMat(A, X);
Mat Y_cpu = A * X;
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
VERIFY_IS_EQUAL(Y_gpu.rows(), rows);
VERIFY_IS_EQUAL(Y_gpu.cols(), nrhs);
VERIFY((Y_gpu - Y_cpu).norm() / (Y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- Identity matrix: I * x = x --------------------------------------------
template <typename Scalar>
void test_identity(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
// Build sparse identity.
SpMat eye(n, n);
eye.setIdentity();
eye.makeCompressed();
Vec x = Vec::Random(n);
GpuSparseContext<Scalar> ctx;
Vec y = ctx.multiply(eye, x);
RealScalar tol = NumTraits<Scalar>::epsilon();
VERIFY((y - x).norm() < tol);
}
// ---- Context reuse ----------------------------------------------------------
template <typename Scalar>
void test_reuse(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
GpuSparseContext<Scalar> ctx;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
for (int trial = 0; trial < 3; ++trial) {
SpMat A = make_sparse<Scalar>(n, n);
Vec x = Vec::Random(n);
Vec y_gpu = ctx.multiply(A, x);
Vec y_cpu = A * x;
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
}
// ---- Empty ------------------------------------------------------------------
template <typename Scalar>
void test_empty() {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
SpMat A(0, 0);
A.makeCompressed();
Vec x(0);
GpuSparseContext<Scalar> ctx;
Vec y = ctx.multiply(A, x);
VERIFY_IS_EQUAL(y.size(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_spmv<Scalar>(64, 64));
CALL_SUBTEST(test_spmv<Scalar>(128, 64)); // non-square
CALL_SUBTEST(test_spmv<Scalar>(64, 128)); // wide
CALL_SUBTEST(test_spmv_alpha_beta<Scalar>(64));
CALL_SUBTEST(test_spmv_transpose<Scalar>(128, 64));
CALL_SUBTEST(test_spmm<Scalar>(64, 64, 4));
CALL_SUBTEST(test_identity<Scalar>(64));
CALL_SUBTEST(test_reuse<Scalar>(64));
CALL_SUBTEST(test_empty<Scalar>());
}
EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
}