Eigen/GPU

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GPU_MODULE_H
#define EIGEN_GPU_MODULE_H

#include "Core"

#include "src/Core/util/DisableStupidWarnings.h"

/** \defgroup GPU_Module GPU module
 *
 * GPU-accelerated solvers and operations using NVIDIA CUDA libraries
 * (cuSOLVER, cuBLAS, cuSPARSE, cuFFT, cuDSS).
 *
 * This module provides explicit GPU solver classes that coexist with Eigen's
 * CPU solvers. Unlike the LAPACKE dispatch (which replaces the CPU
 * implementation globally), GPU classes are separate types the user
 * instantiates by choice:
 *
 * \code
 * #define EIGEN_USE_GPU
 * #include <Eigen/GPU>
 *
 * // CPU path (unchanged)
 * Eigen::LLT<Eigen::MatrixXd> llt_cpu(A);
 *
 * // GPU path (explicit)
 * Eigen::GpuLLT<double> llt_gpu(A);   // L stays on device
 * auto X = llt_gpu.solve(B);          // only B transferred per solve
 * \endcode
 *
 * Requires CUDA 11.4+. See CLAUDE.md.
 */

#ifdef EIGEN_USE_GPU
// IWYU pragma: begin_exports
#include "src/GPU/DeviceScalar.h"
#include "src/GPU/DeviceMatrix.h"
#include "src/GPU/GpuContext.h"
#include "src/GPU/DeviceExpr.h"
#include "src/GPU/DeviceBlasExpr.h"
#include "src/GPU/DeviceSolverExpr.h"
#include "src/GPU/DeviceDispatch.h"
#include "src/GPU/GpuLLT.h"
#include "src/GPU/GpuLU.h"
#include "src/GPU/GpuQR.h"
#include "src/GPU/GpuSVD.h"
#include "src/GPU/GpuEigenSolver.h"
#include "src/GPU/CuFftSupport.h"
#include "src/GPU/GpuFFT.h"
#include "src/GPU/CuSparseSupport.h"
#ifdef EIGEN_SPARSECORE_MODULE_H
#include "src/GPU/GpuSparseContext.h"
#endif
#if defined(EIGEN_CUDSS) && defined(EIGEN_SPARSECORE_MODULE_H)
#include "src/GPU/CuDssSupport.h"
#include "src/GPU/GpuSparseSolverBase.h"
#include "src/GPU/GpuSparseLLT.h"
#include "src/GPU/GpuSparseLDLT.h"
#include "src/GPU/GpuSparseLU.h"
#endif
// IWYU pragma: end_exports
#endif

#include "src/Core/util/ReenableStupidWarnings.h"

#endif  // EIGEN_GPU_MODULE_H
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER) Add Eigen/GPU module: A standalone GPU library dispatch layer where DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls. CPU and GPU solvers coexist in the same binary with compatible syntax. Core infrastructure: - DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with async host transfer (fromHost/toHost) and CUDA event-based cross-stream synchronization. - GpuContext: Unified execution context owning a CUDA stream + cuBLAS handle + cuSOLVER handle. Thread-local default with explicit override via setThreadLocal(). Stream-borrowing constructor for integration. - DeviceBuffer: Typed RAII device allocation with move semantics. cuBLAS dispatch (expression syntax): - GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm) - TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm) - SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm) - SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk) cuSOLVER dispatch: - GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs) - GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs) - Solver chaining: auto x = d_A.llt().solve(d_B) - Solver expressions with .device(ctx) for explicit stream control. CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11, Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit. Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp, gpu_device_matrix.cpp, gpu_library_example.cu Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp, bench_gpu_batching.cpp 2026-04-09 16:15:39 -07:00			`// This file is part of Eigen, a lightweight C++ template library`
			`// for linear algebra.`
			`//`
			`// This Source Code Form is subject to the terms of the Mozilla`
			`// Public License v. 2.0. If a copy of the MPL was not distributed`
			`// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.`

			`#ifndef EIGEN_GPU_MODULE_H`
			`#define EIGEN_GPU_MODULE_H`

			`#include "Core"`

			`#include "src/Core/util/DisableStupidWarnings.h"`

			`/** \defgroup GPU_Module GPU module`
			`*`
			`* GPU-accelerated solvers and operations using NVIDIA CUDA libraries`
			`* (cuSOLVER, cuBLAS, cuSPARSE, cuFFT, cuDSS).`
			`*`
			`* This module provides explicit GPU solver classes that coexist with Eigen's`
			`* CPU solvers. Unlike the LAPACKE dispatch (which replaces the CPU`
			`* implementation globally), GPU classes are separate types the user`
			`* instantiates by choice:`
			`*`
			`* \code`
			`* #define EIGEN_USE_GPU`
			`* #include <Eigen/GPU>`
			`*`
			`* // CPU path (unchanged)`
			`* Eigen::LLT<Eigen::MatrixXd> llt_cpu(A);`
			`*`
			`* // GPU path (explicit)`
			`* Eigen::GpuLLT<double> llt_gpu(A); // L stays on device`
			`* auto X = llt_gpu.solve(B); // only B transferred per solve`
			`* \endcode`
			`*`
			`* Requires CUDA 11.4+. See CLAUDE.md.`
			`*/`

			`#ifdef EIGEN_USE_GPU`
			`// IWYU pragma: begin_exports`
GPU: Add BLAS-1 ops, DeviceScalar, device-resident SpMV, and CG interop (5/5) Add the operator interface needed for GPU iterative solvers: - BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(), noalias(), operator+=/-=/\= dispatching to cuBLAS axpy/scal/dot/nrm2. - DeviceScalar<Scalar>: device-resident scalar returned by reductions. Defers host sync until value is read (implicit conversion). Device-side division via NPP for real types. - GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(), cusparseHandle(). - GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm selection and plan caching. - GpuSparseContext: GpuContext& constructor for same-stream execution, deviceView() returning DeviceSparseView with operator for device-resident SpMV (d_y = d_A * d_x). - geam expressions: d_C = d_A + alpha * d_B via cublasXgeam. - GpuSVD::matrixV() convenience wrapper. These additions make DeviceMatrix usable as a VectorType in Eigen algorithm templates. Conjugate gradient is the motivating example and is tested against CPU ConjugateGradient for correctness. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-09 19:54:13 -07:00			`#include "src/GPU/DeviceScalar.h"`
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER) Add Eigen/GPU module: A standalone GPU library dispatch layer where DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls. CPU and GPU solvers coexist in the same binary with compatible syntax. Core infrastructure: - DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with async host transfer (fromHost/toHost) and CUDA event-based cross-stream synchronization. - GpuContext: Unified execution context owning a CUDA stream + cuBLAS handle + cuSOLVER handle. Thread-local default with explicit override via setThreadLocal(). Stream-borrowing constructor for integration. - DeviceBuffer: Typed RAII device allocation with move semantics. cuBLAS dispatch (expression syntax): - GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm) - TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm) - SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm) - SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk) cuSOLVER dispatch: - GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs) - GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs) - Solver chaining: auto x = d_A.llt().solve(d_B) - Solver expressions with .device(ctx) for explicit stream control. CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11, Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit. Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp, gpu_device_matrix.cpp, gpu_library_example.cu Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp, bench_gpu_batching.cpp 2026-04-09 16:15:39 -07:00			`#include "src/GPU/DeviceMatrix.h"`
			`#include "src/GPU/GpuContext.h"`
			`#include "src/GPU/DeviceExpr.h"`
			`#include "src/GPU/DeviceBlasExpr.h"`
			`#include "src/GPU/DeviceSolverExpr.h"`
			`#include "src/GPU/DeviceDispatch.h"`
			`#include "src/GPU/GpuLLT.h"`
			`#include "src/GPU/GpuLU.h"`
GPU: Add dense cuSOLVER solvers (QR, SVD, EigenSolver) Add QR (geqrf + ormqr + trsm), SVD (gesvd), and self-adjoint eigenvalue decomposition (syevd) via cuSOLVER. All support host and DeviceMatrix input. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-09 19:11:25 -07:00			`#include "src/GPU/GpuQR.h"`
			`#include "src/GPU/GpuSVD.h"`
			`#include "src/GPU/GpuEigenSolver.h"`
GPU: Add sparse solvers, FFT, and SpMV (cuDSS, cuFFT, cuSPARSE) Add GPU sparse direct solvers (Cholesky, LDL^T, LU) via cuDSS, 1D/2D FFT via cuFFT with plan caching, and sparse matrix-vector/matrix multiply (SpMV/SpMM) via cuSPARSE. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-09 19:11:49 -07:00			`#include "src/GPU/CuFftSupport.h"`
			`#include "src/GPU/GpuFFT.h"`
			`#include "src/GPU/CuSparseSupport.h"`
GPU: Add BLAS-1 ops, DeviceScalar, device-resident SpMV, and CG interop (5/5) Add the operator interface needed for GPU iterative solvers: - BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(), noalias(), operator+=/-=/\= dispatching to cuBLAS axpy/scal/dot/nrm2. - DeviceScalar<Scalar>: device-resident scalar returned by reductions. Defers host sync until value is read (implicit conversion). Device-side division via NPP for real types. - GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(), cusparseHandle(). - GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm selection and plan caching. - GpuSparseContext: GpuContext& constructor for same-stream execution, deviceView() returning DeviceSparseView with operator for device-resident SpMV (d_y = d_A * d_x). - geam expressions: d_C = d_A + alpha * d_B via cublasXgeam. - GpuSVD::matrixV() convenience wrapper. These additions make DeviceMatrix usable as a VectorType in Eigen algorithm templates. Conjugate gradient is the motivating example and is tested against CPU ConjugateGradient for correctness. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-09 19:54:13 -07:00			`#ifdef EIGEN_SPARSECORE_MODULE_H`
GPU: Add sparse solvers, FFT, and SpMV (cuDSS, cuFFT, cuSPARSE) Add GPU sparse direct solvers (Cholesky, LDL^T, LU) via cuDSS, 1D/2D FFT via cuFFT with plan caching, and sparse matrix-vector/matrix multiply (SpMV/SpMM) via cuSPARSE. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-09 19:11:49 -07:00			`#include "src/GPU/GpuSparseContext.h"`
GPU: Add BLAS-1 ops, DeviceScalar, device-resident SpMV, and CG interop (5/5) Add the operator interface needed for GPU iterative solvers: - BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(), noalias(), operator+=/-=/\= dispatching to cuBLAS axpy/scal/dot/nrm2. - DeviceScalar<Scalar>: device-resident scalar returned by reductions. Defers host sync until value is read (implicit conversion). Device-side division via NPP for real types. - GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(), cusparseHandle(). - GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm selection and plan caching. - GpuSparseContext: GpuContext& constructor for same-stream execution, deviceView() returning DeviceSparseView with operator for device-resident SpMV (d_y = d_A * d_x). - geam expressions: d_C = d_A + alpha * d_B via cublasXgeam. - GpuSVD::matrixV() convenience wrapper. These additions make DeviceMatrix usable as a VectorType in Eigen algorithm templates. Conjugate gradient is the motivating example and is tested against CPU ConjugateGradient for correctness. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-09 19:54:13 -07:00			`#endif`
			`#if defined(EIGEN_CUDSS) && defined(EIGEN_SPARSECORE_MODULE_H)`
GPU: Add sparse solvers, FFT, and SpMV (cuDSS, cuFFT, cuSPARSE) Add GPU sparse direct solvers (Cholesky, LDL^T, LU) via cuDSS, 1D/2D FFT via cuFFT with plan caching, and sparse matrix-vector/matrix multiply (SpMV/SpMM) via cuSPARSE. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-09 19:11:49 -07:00			`#include "src/GPU/CuDssSupport.h"`
			`#include "src/GPU/GpuSparseSolverBase.h"`
			`#include "src/GPU/GpuSparseLLT.h"`
			`#include "src/GPU/GpuSparseLDLT.h"`
			`#include "src/GPU/GpuSparseLU.h"`
			`#endif`
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER) Add Eigen/GPU module: A standalone GPU library dispatch layer where DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls. CPU and GPU solvers coexist in the same binary with compatible syntax. Core infrastructure: - DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with async host transfer (fromHost/toHost) and CUDA event-based cross-stream synchronization. - GpuContext: Unified execution context owning a CUDA stream + cuBLAS handle + cuSOLVER handle. Thread-local default with explicit override via setThreadLocal(). Stream-borrowing constructor for integration. - DeviceBuffer: Typed RAII device allocation with move semantics. cuBLAS dispatch (expression syntax): - GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm) - TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm) - SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm) - SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk) cuSOLVER dispatch: - GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs) - GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs) - Solver chaining: auto x = d_A.llt().solve(d_B) - Solver expressions with .device(ctx) for explicit stream control. CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11, Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit. Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp, gpu_device_matrix.cpp, gpu_library_example.cu Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp, bench_gpu_batching.cpp 2026-04-09 16:15:39 -07:00			`// IWYU pragma: end_exports`
			`#endif`

			`#include "src/Core/util/ReenableStupidWarnings.h"`

			`#endif // EIGEN_GPU_MODULE_H`