Compare commits

...

2 Commits

Author SHA1 Message Date
Rasmus Munk Larsen
43a95b62bb GPU: Add sparse solvers, FFT, and SpMV (cuDSS, cuFFT, cuSPARSE)
Add GPU sparse direct solvers (Cholesky, LDL^T, LU) via cuDSS, 1D/2D FFT
via cuFFT with plan caching, and sparse matrix-vector/matrix multiply
(SpMV/SpMM) via cuSPARSE.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 19:11:49 -07:00
Rasmus Munk Larsen
8593c7f5a1 GPU: Add dense cuSOLVER solvers (QR, SVD, EigenSolver)
Add QR (geqrf + ormqr + trsm), SVD (gesvd), and self-adjoint eigenvalue
decomposition (syevd) via cuSOLVER. All support host and DeviceMatrix input.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 19:11:34 -07:00
33 changed files with 4908 additions and 161 deletions

View File

@@ -47,6 +47,20 @@
#include "src/GPU/DeviceDispatch.h" #include "src/GPU/DeviceDispatch.h"
#include "src/GPU/GpuLLT.h" #include "src/GPU/GpuLLT.h"
#include "src/GPU/GpuLU.h" #include "src/GPU/GpuLU.h"
#include "src/GPU/GpuQR.h"
#include "src/GPU/GpuSVD.h"
#include "src/GPU/GpuEigenSolver.h"
#include "src/GPU/CuFftSupport.h"
#include "src/GPU/GpuFFT.h"
#include "src/GPU/CuSparseSupport.h"
#include "src/GPU/GpuSparseContext.h"
#ifdef EIGEN_CUDSS
#include "src/GPU/CuDssSupport.h"
#include "src/GPU/GpuSparseSolverBase.h"
#include "src/GPU/GpuSparseLLT.h"
#include "src/GPU/GpuSparseLDLT.h"
#include "src/GPU/GpuSparseLU.h"
#endif
// IWYU pragma: end_exports // IWYU pragma: end_exports
#endif #endif

View File

@@ -51,25 +51,69 @@ constexpr cublasOperation_t to_cublas_op(GpuOp op) {
// ---- Scalar → cublasComputeType_t ------------------------------------------- // ---- Scalar → cublasComputeType_t -------------------------------------------
// cublasGemmEx requires a compute type (separate from the data type). // cublasGemmEx requires a compute type (separate from the data type).
//
// Precision policy:
// - Default: tensor core algorithms enabled (CUBLAS_GEMM_DEFAULT_TENSOR_OP).
// For double, cuBLAS may use Ozaki emulation on sm_80+ tensor cores.
// - EIGEN_CUDA_TF32: opt-in to TF32 for float (~2x faster, 10-bit mantissa).
// - EIGEN_NO_CUDA_TENSOR_OPS: disables all tensor core usage. Uses pedantic
// compute types and CUBLAS_GEMM_DEFAULT algorithm. For bit-exact reproducibility.
template <typename Scalar> template <typename Scalar>
struct cuda_compute_type; struct cuda_compute_type;
template <> template <>
struct cuda_compute_type<float> { struct cuda_compute_type<float> {
#if defined(EIGEN_NO_CUDA_TENSOR_OPS)
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_PEDANTIC;
#elif defined(EIGEN_CUDA_TF32)
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_FAST_TF32;
#else
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F; static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
#endif
}; };
template <> template <>
struct cuda_compute_type<double> { struct cuda_compute_type<double> {
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F_PEDANTIC;
#else
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F; static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
#endif
}; };
template <> template <>
struct cuda_compute_type<std::complex<float>> { struct cuda_compute_type<std::complex<float>> {
#if defined(EIGEN_NO_CUDA_TENSOR_OPS)
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_PEDANTIC;
#elif defined(EIGEN_CUDA_TF32)
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_FAST_TF32;
#else
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F; static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
#endif
}; };
template <> template <>
struct cuda_compute_type<std::complex<double>> { struct cuda_compute_type<std::complex<double>> {
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F_PEDANTIC;
#else
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F; static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
#endif
};
// ---- GEMM algorithm hint ----------------------------------------------------
constexpr cublasGemmAlgo_t cuda_gemm_algo() {
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
return CUBLAS_GEMM_DEFAULT;
#else
return CUBLAS_GEMM_DEFAULT_TENSOR_OP;
#endif
}
// ---- Alpha/beta scalar type for cublasGemmEx --------------------------------
// For standard types, alpha/beta match the scalar type.
template <typename Scalar>
struct cuda_gemm_scalar {
using type = Scalar;
}; };
// ---- Type-specific cuBLAS wrappers ------------------------------------------ // ---- Type-specific cuBLAS wrappers ------------------------------------------
@@ -154,6 +198,35 @@ inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cubla
reinterpret_cast<cuDoubleComplex*>(C), ldc); reinterpret_cast<cuDoubleComplex*>(C), ldc);
} }
// GEAM wrappers: C = alpha * op(A) + beta * op(B)
// Covers transpose, scale, matrix add/subtract in one call.
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
const float* alpha, const float* A, int lda, const float* beta, const float* B,
int ldb, float* C, int ldc) {
return cublasSgeam(h, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
}
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
const double* alpha, const double* A, int lda, const double* beta, const double* B,
int ldb, double* C, int ldc) {
return cublasDgeam(h, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
}
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
const std::complex<float>* alpha, const std::complex<float>* A, int lda,
const std::complex<float>* beta, const std::complex<float>* B, int ldb,
std::complex<float>* C, int ldc) {
return cublasCgeam(h, transa, transb, m, n, reinterpret_cast<const cuComplex*>(alpha),
reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<const cuComplex*>(beta),
reinterpret_cast<const cuComplex*>(B), ldb, reinterpret_cast<cuComplex*>(C), ldc);
}
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
const std::complex<double>* alpha, const std::complex<double>* A, int lda,
const std::complex<double>* beta, const std::complex<double>* B, int ldb,
std::complex<double>* C, int ldc) {
return cublasZgeam(h, transa, transb, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<const cuDoubleComplex*>(beta),
reinterpret_cast<const cuDoubleComplex*>(B), ldb, reinterpret_cast<cuDoubleComplex*>(C), ldc);
}
} // namespace internal } // namespace internal
} // namespace Eigen } // namespace Eigen

View File

@@ -0,0 +1,134 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// cuDSS support utilities: error checking macro, type mapping.
//
// cuDSS is NVIDIA's sparse direct solver library, supporting Cholesky (LL^T),
// LDL^T, and LU factorization on GPU. It requires CUDA 12.0+ and is
// distributed separately from the CUDA Toolkit.
#ifndef EIGEN_GPU_CUDSS_SUPPORT_H
#define EIGEN_GPU_CUDSS_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include <cudss.h>
namespace Eigen {
namespace internal {
// ---- Error checking ---------------------------------------------------------
#define EIGEN_CUDSS_CHECK(x) \
do { \
cudssStatus_t _s = (x); \
eigen_assert(_s == CUDSS_STATUS_SUCCESS && "cuDSS call failed: " #x); \
EIGEN_UNUSED_VARIABLE(_s); \
} while (0)
// ---- Scalar → cudssMatrixType_t for SPD/HPD ---------------------------------
template <typename Scalar>
struct cudss_spd_type;
template <>
struct cudss_spd_type<float> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
};
template <>
struct cudss_spd_type<double> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
};
template <>
struct cudss_spd_type<std::complex<float>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
};
template <>
struct cudss_spd_type<std::complex<double>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
};
// ---- Scalar → cudssMatrixType_t for symmetric/Hermitian ---------------------
template <typename Scalar>
struct cudss_symmetric_type;
template <>
struct cudss_symmetric_type<float> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
};
template <>
struct cudss_symmetric_type<double> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
};
template <>
struct cudss_symmetric_type<std::complex<float>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
};
template <>
struct cudss_symmetric_type<std::complex<double>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
};
// ---- StorageIndex → cudaDataType_t ------------------------------------------
template <typename StorageIndex>
struct cudss_index_type;
template <>
struct cudss_index_type<int> {
static constexpr cudaDataType_t value = CUDA_R_32I;
};
template <>
struct cudss_index_type<int64_t> {
static constexpr cudaDataType_t value = CUDA_R_64I;
};
// ---- UpLo → cudssMatrixViewType_t -------------------------------------------
// For symmetric matrices stored as CSC (ColMajor), cuDSS sees CSR of A^T.
// Since A = A^T, the data is the same, but the triangle view must be swapped.
template <int UpLo, int StorageOrder>
struct cudss_view_type;
// ColMajor (CSC) passed as CSR: lower ↔ upper swap.
template <>
struct cudss_view_type<Lower, ColMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
};
template <>
struct cudss_view_type<Upper, ColMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
};
// RowMajor (CSR) passed directly: no swap needed.
template <>
struct cudss_view_type<Lower, RowMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
};
template <>
struct cudss_view_type<Upper, RowMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
};
} // namespace internal
// ---- Ordering enum ----------------------------------------------------------
enum class GpuSparseOrdering {
AMD, // Default fill-reducing ordering
METIS, // METIS nested dissection
RCM // Reverse Cuthill-McKee
};
} // namespace Eigen
#endif // EIGEN_GPU_CUDSS_SUPPORT_H

View File

@@ -0,0 +1,103 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// cuFFT support utilities: error checking macro, type mapping.
#ifndef EIGEN_GPU_CUFFT_SUPPORT_H
#define EIGEN_GPU_CUFFT_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include <cufft.h>
namespace Eigen {
namespace internal {
// ---- Error checking ---------------------------------------------------------
#define EIGEN_CUFFT_CHECK(x) \
do { \
cufftResult _r = (x); \
eigen_assert(_r == CUFFT_SUCCESS && "cuFFT call failed: " #x); \
EIGEN_UNUSED_VARIABLE(_r); \
} while (0)
// ---- Scalar → cufftType traits ----------------------------------------------
template <typename Scalar>
struct cufft_c2c_type;
template <>
struct cufft_c2c_type<float> {
static constexpr cufftType value = CUFFT_C2C;
};
template <>
struct cufft_c2c_type<double> {
static constexpr cufftType value = CUFFT_Z2Z;
};
template <typename Scalar>
struct cufft_r2c_type;
template <>
struct cufft_r2c_type<float> {
static constexpr cufftType value = CUFFT_R2C;
};
template <>
struct cufft_r2c_type<double> {
static constexpr cufftType value = CUFFT_D2Z;
};
template <typename Scalar>
struct cufft_c2r_type;
template <>
struct cufft_c2r_type<float> {
static constexpr cufftType value = CUFFT_C2R;
};
template <>
struct cufft_c2r_type<double> {
static constexpr cufftType value = CUFFT_Z2D;
};
// ---- Type-dispatched cuFFT execution ----------------------------------------
// C2C
inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<float>* in, std::complex<float>* out,
int direction) {
return cufftExecC2C(plan, reinterpret_cast<cufftComplex*>(in), reinterpret_cast<cufftComplex*>(out), direction);
}
inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<double>* in, std::complex<double>* out,
int direction) {
return cufftExecZ2Z(plan, reinterpret_cast<cufftDoubleComplex*>(in), reinterpret_cast<cufftDoubleComplex*>(out),
direction);
}
// R2C
inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, float* in, std::complex<float>* out) {
return cufftExecR2C(plan, in, reinterpret_cast<cufftComplex*>(out));
}
inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, double* in, std::complex<double>* out) {
return cufftExecD2Z(plan, in, reinterpret_cast<cufftDoubleComplex*>(out));
}
// C2R
inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<float>* in, float* out) {
return cufftExecC2R(plan, reinterpret_cast<cufftComplex*>(in), out);
}
inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<double>* in, double* out) {
return cufftExecZ2D(plan, reinterpret_cast<cufftDoubleComplex*>(in), out);
}
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_CUFFT_SUPPORT_H

View File

@@ -91,6 +91,68 @@ struct cusolver_fill_mode<Upper, RowMajor> {
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER; static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
}; };
// ---- Type-specific cuSOLVER wrappers ----------------------------------------
// cuSOLVER does not provide generic X variants for ormqr/unmqr. These overloaded
// wrappers dispatch to the correct type-specific function.
// For real types: ormqr (orthogonal Q). For complex types: unmqr (unitary Q).
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
int n, int k, const float* A, int lda, const float* tau, float* C, int ldc,
float* work, int lwork, int* info) {
return cusolverDnSormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
}
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
int n, int k, const double* A, int lda, const double* tau, double* C, int ldc,
double* work, int lwork, int* info) {
return cusolverDnDormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
}
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
int n, int k, const std::complex<float>* A, int lda,
const std::complex<float>* tau, std::complex<float>* C, int ldc,
std::complex<float>* work, int lwork, int* info) {
return cusolverDnCunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<cuComplex*>(C), ldc,
reinterpret_cast<cuComplex*>(work), lwork, info);
}
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
int n, int k, const std::complex<double>* A, int lda,
const std::complex<double>* tau, std::complex<double>* C, int ldc,
std::complex<double>* work, int lwork, int* info) {
return cusolverDnZunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
reinterpret_cast<const cuDoubleComplex*>(tau), reinterpret_cast<cuDoubleComplex*>(C), ldc,
reinterpret_cast<cuDoubleComplex*>(work), lwork, info);
}
// Buffer size wrappers for ormqr/unmqr.
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
cublasOperation_t trans, int m, int n, int k, const float* A,
int lda, const float* tau, const float* C, int ldc, int* lwork) {
return cusolverDnSormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
}
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
cublasOperation_t trans, int m, int n, int k, const double* A,
int lda, const double* tau, const double* C, int ldc, int* lwork) {
return cusolverDnDormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
}
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
cublasOperation_t trans, int m, int n, int k,
const std::complex<float>* A, int lda,
const std::complex<float>* tau, const std::complex<float>* C,
int ldc, int* lwork) {
return cusolverDnCunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<const cuComplex*>(C),
ldc, lwork);
}
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
cublasOperation_t trans, int m, int n, int k,
const std::complex<double>* A, int lda,
const std::complex<double>* tau, const std::complex<double>* C,
int ldc, int* lwork) {
return cusolverDnZunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
reinterpret_cast<const cuDoubleComplex*>(tau),
reinterpret_cast<const cuDoubleComplex*>(C), ldc, lwork);
}
} // namespace internal } // namespace internal
} // namespace Eigen } // namespace Eigen

View File

@@ -0,0 +1,34 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// cuSPARSE support utilities: error checking macro.
#ifndef EIGEN_GPU_CUSPARSE_SUPPORT_H
#define EIGEN_GPU_CUSPARSE_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include <cusparse.h>
namespace Eigen {
namespace internal {
#define EIGEN_CUSPARSE_CHECK(x) \
do { \
cusparseStatus_t _s = (x); \
eigen_assert(_s == CUSPARSE_STATUS_SUCCESS && "cuSPARSE call failed: " #x); \
EIGEN_UNUSED_VARIABLE(_s); \
} while (0)
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_CUSPARSE_SUPPORT_H

View File

@@ -58,8 +58,8 @@ void dispatch_gemm(
eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch"); eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch");
const int64_t lda = A.outerStride(); const int64_t lda = A.rows();
const int64_t ldb = B.outerStride(); const int64_t ldb = B.rows();
// Serialize all accesses to the destination buffer on this stream. // Serialize all accesses to the destination buffer on this stream.
if (!dst.empty()) { if (!dst.empty()) {
@@ -71,9 +71,13 @@ void dispatch_gemm(
if (resized) { if (resized) {
dst.resize(m, n); dst.resize(m, n);
} }
const int64_t ldc = dst.outerStride(); const int64_t ldc = dst.rows();
Scalar alpha_val = alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs()); // cuBLAS requires alpha/beta as float for half/bfloat16 inputs.
using GemmScalar = typename cuda_gemm_scalar<Scalar>::type;
GemmScalar alpha_gval =
static_cast<GemmScalar>(alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs()));
GemmScalar beta_gval = static_cast<GemmScalar>(beta_val);
// Wait for operands to be ready on this stream. // Wait for operands to be ready on this stream.
A.waitReady(ctx.stream()); A.waitReady(ctx.stream());
@@ -81,7 +85,7 @@ void dispatch_gemm(
// If there is no existing valid destination to accumulate into, treat it as // If there is no existing valid destination to accumulate into, treat it as
// zero rather than reading uninitialized memory. // zero rather than reading uninitialized memory.
if (resized && beta_val != Scalar(0) && dst.sizeInBytes() > 0) { if (resized && beta_gval != GemmScalar(0) && dst.sizeInBytes() > 0) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream())); EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
} }
@@ -89,9 +93,9 @@ void dispatch_gemm(
constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value; constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
EIGEN_CUBLAS_CHECK(cublasGemmEx(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n), EIGEN_CUBLAS_CHECK(cublasGemmEx(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n),
static_cast<int>(k), &alpha_val, A.data(), dtype, static_cast<int>(lda), B.data(), static_cast<int>(k), &alpha_gval, A.data(), dtype, static_cast<int>(lda), B.data(),
dtype, static_cast<int>(ldb), &beta_val, dst.data(), dtype, static_cast<int>(ldc), dtype, static_cast<int>(ldb), &beta_gval, dst.data(), dtype, static_cast<int>(ldc),
compute, CUBLAS_GEMM_DEFAULT)); compute, cuda_gemm_algo()));
dst.recordReady(ctx.stream()); dst.recordReady(ctx.stream());
} }
@@ -125,9 +129,9 @@ void dispatch_llt_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LltSol
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value; constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo, ColMajor>::value; constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo, ColMajor>::value;
const int64_t lda = static_cast<int64_t>(A.outerStride()); const int64_t lda = static_cast<int64_t>(A.rows());
const int64_t ldb = static_cast<int64_t>(B.outerStride()); const int64_t ldb = static_cast<int64_t>(B.rows());
eigen_assert(ldb == static_cast<int64_t>(B.rows()) && "DeviceMatrix must be densely packed");
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar); const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar); const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
@@ -163,7 +167,7 @@ void dispatch_llt_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LltSol
// Solve. // Solve.
DeviceBuffer d_solve_info(sizeof(int)); DeviceBuffer d_solve_info(sizeof(int));
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), nrhs, dtype, EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), nrhs, dtype,
d_factor.ptr, lda, dtype, dst.data(), static_cast<int64_t>(dst.outerStride()), d_factor.ptr, lda, dtype, dst.data(), static_cast<int64_t>(dst.rows()),
static_cast<int*>(d_solve_info.ptr))); static_cast<int*>(d_solve_info.ptr)));
// Sync to ensure workspace locals can be freed safely. // Sync to ensure workspace locals can be freed safely.
@@ -201,9 +205,9 @@ void dispatch_lu_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LuSolve
if (!dst.empty()) dst.waitReady(ctx.stream()); if (!dst.empty()) dst.waitReady(ctx.stream());
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value; constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
const int64_t lda = static_cast<int64_t>(A.outerStride()); const int64_t lda = static_cast<int64_t>(A.rows());
const int64_t ldb = static_cast<int64_t>(B.outerStride()); const int64_t ldb = static_cast<int64_t>(B.rows());
eigen_assert(ldb == static_cast<int64_t>(B.rows()) && "DeviceMatrix must be densely packed");
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar); const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar); const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t); const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t);
@@ -245,7 +249,7 @@ void dispatch_lu_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LuSolve
DeviceBuffer d_solve_info(sizeof(int)); DeviceBuffer d_solve_info(sizeof(int));
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, static_cast<int64_t>(n), nrhs, EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, static_cast<int64_t>(n), nrhs,
dtype, d_lu.ptr, lda, static_cast<const int64_t*>(d_ipiv.ptr), dtype, dtype, d_lu.ptr, lda, static_cast<const int64_t*>(d_ipiv.ptr), dtype,
dst.data(), static_cast<int64_t>(dst.outerStride()), dst.data(), static_cast<int64_t>(dst.rows()),
static_cast<int*>(d_solve_info.ptr))); static_cast<int*>(d_solve_info.ptr)));
// Sync to ensure workspace locals can be freed safely. // Sync to ensure workspace locals can be freed safely.
@@ -285,15 +289,15 @@ void dispatch_trsm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const TrsmExpr<Sc
// D2D copy B → dst (trsm is in-place on the RHS). // D2D copy B → dst (trsm is in-place on the RHS).
dst.resize(n, B.cols()); dst.resize(n, B.cols());
const size_t rhs_bytes = static_cast<size_t>(dst.outerStride()) * static_cast<size_t>(nrhs) * sizeof(Scalar); const size_t rhs_bytes = static_cast<size_t>(dst.rows()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream())); EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
Scalar alpha(1); Scalar alpha(1);
EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs, EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs,
&alpha, A.data(), static_cast<int>(A.outerStride()), dst.data(), &alpha, A.data(), static_cast<int>(A.rows()), dst.data(),
static_cast<int>(dst.outerStride()))); static_cast<int>(dst.rows())));
dst.recordReady(ctx.stream()); dst.recordReady(ctx.stream());
} }
@@ -329,8 +333,8 @@ void dispatch_symm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SymmExpr<Sc
Scalar alpha(1), beta(0); Scalar alpha(1), beta(0);
EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &alpha, A.data(), EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &alpha, A.data(),
static_cast<int>(A.outerStride()), B.data(), static_cast<int>(B.outerStride()), &beta, static_cast<int>(A.rows()), B.data(), static_cast<int>(B.rows()), &beta, dst.data(),
dst.data(), static_cast<int>(dst.outerStride()))); static_cast<int>(dst.rows())));
dst.recordReady(ctx.stream()); dst.recordReady(ctx.stream());
} }
@@ -367,8 +371,7 @@ void dispatch_syrk(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SyrkExpr<Sc
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(), EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(),
static_cast<int>(A.outerStride()), &beta_val, dst.data(), static_cast<int>(A.rows()), &beta_val, dst.data(), static_cast<int>(dst.rows())));
static_cast<int>(dst.outerStride())));
dst.recordReady(ctx.stream()); dst.recordReady(ctx.stream());
} }

View File

@@ -10,7 +10,7 @@
// Typed RAII wrapper for a dense matrix in GPU device memory. // Typed RAII wrapper for a dense matrix in GPU device memory.
// //
// DeviceMatrix<Scalar> holds a column-major matrix on the GPU with tracked // DeviceMatrix<Scalar> holds a column-major matrix on the GPU with tracked
// dimensions and leading dimension. It can be passed to GPU solvers // dimensions. Always dense (leading dimension = rows). It can be passed to GPU solvers
// (GpuLLT, GpuLU, future cuBLAS/cuDSS) without host round-trips. // (GpuLLT, GpuLU, future cuBLAS/cuDSS) without host round-trips.
// //
// Cross-stream safety is automatic: an internal CUDA event tracks when the // Cross-stream safety is automatic: an internal CUDA event tracks when the
@@ -25,7 +25,7 @@
// MatrixXd X = d_X.toHost(); // download + block // MatrixXd X = d_X.toHost(); // download + block
// //
// Async variants: // Async variants:
// auto d_A = DeviceMatrix<double>::fromHostAsync(A.data(), n, n, n, stream); // auto d_A = DeviceMatrix<double>::fromHostAsync(A.data(), n, n, stream);
// auto transfer = d_X.toHostAsync(stream); // enqueue D2H // auto transfer = d_X.toHostAsync(stream); // enqueue D2H
// // ... overlap with other work ... // // ... overlap with other work ...
// MatrixXd X = transfer.get(); // block + retrieve // MatrixXd X = transfer.get(); // block + retrieve
@@ -157,7 +157,8 @@ class HostTransfer {
* *
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double> * \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
* *
* Owns a device allocation with tracked dimensions and leading dimension. * Owns a device allocation with tracked dimensions. Always dense
* (leading dimension = rows; no stride padding).
* An internal CUDA event records when the data was last written, enabling * An internal CUDA event records when the data was last written, enabling
* safe cross-stream consumption without user-visible synchronization. * safe cross-stream consumption without user-visible synchronization.
* *
@@ -177,7 +178,7 @@ class DeviceMatrix {
DeviceMatrix() = default; DeviceMatrix() = default;
/** Allocate uninitialized device memory for a rows x cols matrix. */ /** Allocate uninitialized device memory for a rows x cols matrix. */
DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols), outerStride_(rows) { DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols) {
eigen_assert(rows >= 0 && cols >= 0); eigen_assert(rows >= 0 && cols >= 0);
size_t bytes = sizeInBytes(); size_t bytes = sizeInBytes();
if (bytes > 0) { if (bytes > 0) {
@@ -196,14 +197,12 @@ class DeviceMatrix {
: data_(o.data_), : data_(o.data_),
rows_(o.rows_), rows_(o.rows_),
cols_(o.cols_), cols_(o.cols_),
outerStride_(o.outerStride_),
ready_event_(o.ready_event_), ready_event_(o.ready_event_),
ready_stream_(o.ready_stream_), ready_stream_(o.ready_stream_),
retained_buffer_(std::move(o.retained_buffer_)) { retained_buffer_(std::move(o.retained_buffer_)) {
o.data_ = nullptr; o.data_ = nullptr;
o.rows_ = 0; o.rows_ = 0;
o.cols_ = 0; o.cols_ = 0;
o.outerStride_ = 0;
o.ready_event_ = nullptr; o.ready_event_ = nullptr;
o.ready_stream_ = nullptr; o.ready_stream_ = nullptr;
} }
@@ -215,14 +214,12 @@ class DeviceMatrix {
data_ = o.data_; data_ = o.data_;
rows_ = o.rows_; rows_ = o.rows_;
cols_ = o.cols_; cols_ = o.cols_;
outerStride_ = o.outerStride_;
ready_event_ = o.ready_event_; ready_event_ = o.ready_event_;
ready_stream_ = o.ready_stream_; ready_stream_ = o.ready_stream_;
retained_buffer_ = std::move(o.retained_buffer_); retained_buffer_ = std::move(o.retained_buffer_);
o.data_ = nullptr; o.data_ = nullptr;
o.rows_ = 0; o.rows_ = 0;
o.cols_ = 0; o.cols_ = 0;
o.outerStride_ = 0;
o.ready_event_ = nullptr; o.ready_event_ = nullptr;
o.ready_stream_ = nullptr; o.ready_stream_ = nullptr;
} }
@@ -259,29 +256,17 @@ class DeviceMatrix {
* The caller must keep \p host_data alive until the transfer completes * The caller must keep \p host_data alive until the transfer completes
* (check via the internal event or synchronize the stream). * (check via the internal event or synchronize the stream).
* *
* \param host_data Pointer to contiguous column-major host data. * \param host_data Pointer to contiguous column-major host data.
* \param rows Number of rows. * \param rows Number of rows.
* \param cols Number of columns. * \param cols Number of columns.
* \param outerStride Leading dimension (>= rows). Use rows for dense. * \param stream CUDA stream for the transfer.
* \param stream CUDA stream for the transfer.
*/ */
static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, Index outerStride, static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, cudaStream_t stream) {
cudaStream_t stream) { eigen_assert(rows >= 0 && cols >= 0);
eigen_assert(rows >= 0 && cols >= 0 && outerStride >= rows);
eigen_assert(host_data != nullptr || (rows == 0 || cols == 0)); eigen_assert(host_data != nullptr || (rows == 0 || cols == 0));
DeviceMatrix dm(rows, cols); DeviceMatrix dm(rows, cols);
if (dm.sizeInBytes() > 0) { if (dm.sizeInBytes() > 0) {
// If outerStride == rows (dense), single contiguous copy. EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dm.data_, host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
// Otherwise, copy column by column (strided layout).
if (outerStride == rows) {
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(dm.data_, host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
} else {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(dm.data_, static_cast<size_t>(rows) * sizeof(Scalar), host_data,
static_cast<size_t>(outerStride) * sizeof(Scalar),
static_cast<size_t>(rows) * sizeof(Scalar),
static_cast<size_t>(cols), cudaMemcpyHostToDevice, stream));
}
dm.recordReady(stream); dm.recordReady(stream);
} }
return dm; return dm;
@@ -360,7 +345,6 @@ class DeviceMatrix {
retained_buffer_ = internal::DeviceBuffer(); retained_buffer_ = internal::DeviceBuffer();
rows_ = rows; rows_ = rows;
cols_ = cols; cols_ = cols;
outerStride_ = rows;
size_t bytes = sizeInBytes(); size_t bytes = sizeInBytes();
if (bytes > 0) { if (bytes > 0) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes)); EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
@@ -373,11 +357,10 @@ class DeviceMatrix {
const Scalar* data() const { return data_; } const Scalar* data() const { return data_; }
Index rows() const { return rows_; } Index rows() const { return rows_; }
Index cols() const { return cols_; } Index cols() const { return cols_; }
Index outerStride() const { return outerStride_; }
bool empty() const { return rows_ == 0 || cols_ == 0; } bool empty() const { return rows_ == 0 || cols_ == 0; }
/** Size of the device allocation in bytes. */ /** Size of the device allocation in bytes. */
size_t sizeInBytes() const { return static_cast<size_t>(outerStride_) * static_cast<size_t>(cols_) * sizeof(Scalar); } size_t sizeInBytes() const { return static_cast<size_t>(rows_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
// ---- Event synchronization (public for library dispatch interop) --------- // ---- Event synchronization (public for library dispatch interop) ---------
@@ -466,8 +449,7 @@ class DeviceMatrix {
private: private:
// ---- Private: adopt a raw device pointer (used by friend solvers) -------- // ---- Private: adopt a raw device pointer (used by friend solvers) --------
DeviceMatrix(Scalar* device_ptr, Index rows, Index cols, Index outerStride) DeviceMatrix(Scalar* device_ptr, Index rows, Index cols) : data_(device_ptr), rows_(rows), cols_(cols) {}
: data_(device_ptr), rows_(rows), cols_(cols), outerStride_(outerStride) {}
/** Transfer ownership of the device pointer out. Zeros internal state. */ /** Transfer ownership of the device pointer out. Zeros internal state. */
Scalar* release() { Scalar* release() {
@@ -475,7 +457,6 @@ class DeviceMatrix {
data_ = nullptr; data_ = nullptr;
rows_ = 0; rows_ = 0;
cols_ = 0; cols_ = 0;
outerStride_ = 0;
if (ready_event_) { if (ready_event_) {
(void)cudaEventDestroy(ready_event_); (void)cudaEventDestroy(ready_event_);
ready_event_ = nullptr; ready_event_ = nullptr;
@@ -500,13 +481,18 @@ class DeviceMatrix {
friend class GpuLLT; friend class GpuLLT;
template <typename> template <typename>
friend class GpuLU; friend class GpuLU;
template <typename>
friend class GpuQR;
template <typename>
friend class GpuSVD;
template <typename>
friend class GpuSelfAdjointEigenSolver;
// ---- Data members -------------------------------------------------------- // ---- Data members --------------------------------------------------------
Scalar* data_ = nullptr; Scalar* data_ = nullptr;
Index rows_ = 0; Index rows_ = 0;
Index cols_ = 0; Index cols_ = 0;
Index outerStride_ = 0;
cudaEvent_t ready_event_ = nullptr; // internal: tracks last write completion cudaEvent_t ready_event_ = nullptr; // internal: tracks last write completion
cudaStream_t ready_stream_ = nullptr; // stream that recorded ready_event_ (for same-stream skip) cudaStream_t ready_stream_ = nullptr; // stream that recorded ready_event_ (for same-stream skip)
internal::DeviceBuffer retained_buffer_; // internal: keeps async aux buffers alive internal::DeviceBuffer retained_buffer_; // internal: keeps async aux buffers alive

View File

@@ -0,0 +1,232 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU self-adjoint eigenvalue decomposition using cuSOLVER.
//
// Wraps cusolverDnXsyevd (symmetric/Hermitian divide-and-conquer).
// Stores eigenvalues and eigenvectors on device.
//
// Usage:
// GpuSelfAdjointEigenSolver<double> es(A);
// VectorXd eigenvals = es.eigenvalues();
// MatrixXd eigenvecs = es.eigenvectors();
#ifndef EIGEN_GPU_EIGENSOLVER_H
#define EIGEN_GPU_EIGENSOLVER_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSolverSupport.h"
#include <vector>
namespace Eigen {
template <typename Scalar_>
class GpuSelfAdjointEigenSolver {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
using RealVector = Matrix<RealScalar, Dynamic, 1>;
/** Eigenvalue-only or eigenvalues + eigenvectors. */
enum ComputeMode { EigenvaluesOnly, ComputeEigenvectors };
GpuSelfAdjointEigenSolver() { init_context(); }
template <typename InputType>
explicit GpuSelfAdjointEigenSolver(const EigenBase<InputType>& A, ComputeMode mode = ComputeEigenvectors) {
init_context();
compute(A, mode);
}
~GpuSelfAdjointEigenSolver() {
if (handle_) (void)cusolverDnDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuSelfAdjointEigenSolver(const GpuSelfAdjointEigenSolver&) = delete;
GpuSelfAdjointEigenSolver& operator=(const GpuSelfAdjointEigenSolver&) = delete;
// ---- Factorization -------------------------------------------------------
template <typename InputType>
GpuSelfAdjointEigenSolver& compute(const EigenBase<InputType>& A, ComputeMode mode = ComputeEigenvectors) {
eigen_assert(A.rows() == A.cols() && "GpuSelfAdjointEigenSolver requires a square matrix");
mode_ = mode;
n_ = A.rows();
info_ = InvalidInput;
info_synced_ = false;
if (n_ == 0) {
info_ = Success;
info_synced_ = true;
return *this;
}
const PlainMatrix mat(A.derived());
lda_ = static_cast<int64_t>(n_);
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
// syevd overwrites A with eigenvectors (if requested).
d_A_ = internal::DeviceBuffer(mat_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
factorize();
return *this;
}
GpuSelfAdjointEigenSolver& compute(const DeviceMatrix<Scalar>& d_A, ComputeMode mode = ComputeEigenvectors) {
eigen_assert(d_A.rows() == d_A.cols() && "GpuSelfAdjointEigenSolver requires a square matrix");
mode_ = mode;
n_ = d_A.rows();
info_ = InvalidInput;
info_synced_ = false;
if (n_ == 0) {
info_ = Success;
info_synced_ = true;
return *this;
}
d_A.waitReady(stream_);
lda_ = static_cast<int64_t>(n_);
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
d_A_ = internal::DeviceBuffer(mat_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
factorize();
return *this;
}
// ---- Accessors -----------------------------------------------------------
ComputationInfo info() const {
sync_info();
return info_;
}
Index cols() const { return n_; }
Index rows() const { return n_; }
// TODO: Add device-side accessors (deviceEigenvalues(), deviceEigenvectors())
// returning DeviceMatrix views of the internal buffers, so users can chain
// GPU operations without round-tripping through host memory.
/** Eigenvalues in ascending order. Downloads from device. */
RealVector eigenvalues() const {
sync_info();
eigen_assert(info_ == Success);
RealVector W(n_);
if (n_ > 0) {
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpy(W.data(), d_W_.ptr, static_cast<size_t>(n_) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
}
return W;
}
/** Eigenvectors (columns). Downloads from device.
* Requires ComputeEigenvectors mode. */
PlainMatrix eigenvectors() const {
sync_info();
eigen_assert(info_ == Success);
eigen_assert(mode_ == ComputeEigenvectors && "eigenvectors() requires ComputeEigenvectors mode");
PlainMatrix V(n_, n_);
if (n_ > 0) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(V.data(), d_A_.ptr,
static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar),
cudaMemcpyDeviceToHost));
}
return V;
}
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusolverDnHandle_t handle_ = nullptr;
internal::CusolverParams params_;
internal::DeviceBuffer d_A_; // overwritten with eigenvectors by syevd
internal::DeviceBuffer d_W_; // eigenvalues (RealScalar, length n)
internal::DeviceBuffer d_scratch_; // workspace + info
size_t scratch_size_ = 0;
std::vector<char> h_workspace_;
ComputeMode mode_ = ComputeEigenvectors;
Index n_ = 0;
int64_t lda_ = 0;
ComputationInfo info_ = InvalidInput;
int info_word_ = 0;
bool info_synced_ = true;
void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
ensure_scratch(0);
}
void ensure_scratch(size_t workspace_bytes) {
constexpr size_t kAlign = 16;
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
size_t needed = workspace_bytes + sizeof(int);
if (needed > scratch_size_) {
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_scratch_ = internal::DeviceBuffer(needed);
scratch_size_ = needed;
}
}
void* scratch_workspace() const { return d_scratch_.ptr; }
int* scratch_info() const {
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
}
void sync_info() const {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
const_cast<GpuSelfAdjointEigenSolver*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
const_cast<GpuSelfAdjointEigenSolver*>(this)->info_synced_ = true;
}
}
void factorize() {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
info_synced_ = false;
info_ = InvalidInput;
d_W_ = internal::DeviceBuffer(static_cast<size_t>(n_) * sizeof(RealScalar));
const cusolverEigMode_t jobz =
(mode_ == ComputeEigenvectors) ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
// Use lower triangle (standard convention).
constexpr cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
size_t dev_ws = 0, host_ws = 0;
EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd_bufferSize(handle_, params_.p, jobz, uplo, static_cast<int64_t>(n_), dtype,
d_A_.ptr, lda_, rtype, d_W_.ptr, dtype, &dev_ws, &host_ws));
ensure_scratch(dev_ws);
h_workspace_.resize(host_ws);
EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd(handle_, params_.p, jobz, uplo, static_cast<int64_t>(n_), dtype, d_A_.ptr,
lda_, rtype, d_W_.ptr, dtype, scratch_workspace(), dev_ws,
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
}
};
} // namespace Eigen
#endif // EIGEN_GPU_EIGENSOLVER_H

308
Eigen/src/GPU/GpuFFT.h Normal file
View File

@@ -0,0 +1,308 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU FFT via cuFFT.
//
// Standalone GPU FFT class with plan caching. Supports 1D and 2D transforms:
// C2C (complex-to-complex), R2C (real-to-complex), C2R (complex-to-real).
//
// Inverse transforms are scaled by 1/n (1D) or 1/(n*m) (2D) so that
// inv(fwd(x)) == x, matching Eigen's FFT convention.
//
// cuFFT plans are cached by (size, type) and reused across calls.
//
// Usage:
// GpuFFT<float> fft;
// VectorXcf X = fft.fwd(x); // 1D C2C or R2C
// VectorXcf y = fft.inv(X); // 1D C2C inverse
// VectorXf r = fft.invReal(X, n); // 1D C2R inverse
// MatrixXcf B = fft.fwd2d(A); // 2D C2C forward
// MatrixXcf C = fft.inv2d(B); // 2D C2C inverse
#ifndef EIGEN_GPU_FFT_H
#define EIGEN_GPU_FFT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuFftSupport.h"
#include "./CuBlasSupport.h"
#include <map>
namespace Eigen {
template <typename Scalar_>
class GpuFFT {
public:
using Scalar = Scalar_;
using Complex = std::complex<Scalar>;
using ComplexVector = Matrix<Complex, Dynamic, 1>;
using RealVector = Matrix<Scalar, Dynamic, 1>;
using ComplexMatrix = Matrix<Complex, Dynamic, Dynamic, ColMajor>;
GpuFFT() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
}
~GpuFFT() {
for (auto& kv : plans_) (void)cufftDestroy(kv.second);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuFFT(const GpuFFT&) = delete;
GpuFFT& operator=(const GpuFFT&) = delete;
// ---- 1D Complex-to-Complex ------------------------------------------------
/** Forward 1D C2C FFT. */
template <typename Derived>
ComplexVector fwd(const MatrixBase<Derived>& x,
typename std::enable_if<NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
const ComplexVector input(x.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);
ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));
ComplexVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
/** Inverse 1D C2C FFT. Scaled by 1/n. */
template <typename Derived>
ComplexVector inv(const MatrixBase<Derived>& X) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv() requires complex input");
const ComplexVector input(X.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);
ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));
// Scale by 1/n.
scale_device(static_cast<Complex*>(d_out_.ptr), n, Scalar(1) / Scalar(n));
ComplexVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- 1D Real-to-Complex ---------------------------------------------------
/** Forward 1D R2C FFT. Returns n/2+1 complex values (half-spectrum). */
template <typename Derived>
ComplexVector fwd(const MatrixBase<Derived>& x,
typename std::enable_if<!NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
const RealVector input(x.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);
const int n_complex = n / 2 + 1;
ensure_buffers(n * sizeof(Scalar), n_complex * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_r2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(
internal::cufftExecR2C_dispatch(plan, static_cast<Scalar*>(d_in_.ptr), static_cast<Complex*>(d_out_.ptr)));
ComplexVector result(n_complex);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n_complex * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- 1D Complex-to-Real ---------------------------------------------------
/** Inverse 1D C2R FFT. Input is n/2+1 complex values, output is nfft real values.
* Scaled by 1/nfft. Caller must specify nfft (original real signal length). */
template <typename Derived>
RealVector invReal(const MatrixBase<Derived>& X, Index nfft) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "invReal() requires complex input");
const ComplexVector input(X.derived());
const int n = static_cast<int>(nfft);
const int n_complex = n / 2 + 1;
eigen_assert(input.size() == n_complex);
if (n == 0) return RealVector(0);
ensure_buffers(n_complex * sizeof(Complex), n * sizeof(Scalar));
// cuFFT C2R may overwrite the input, so we copy to d_in_.
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n_complex * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_c2r_type<Scalar>::value);
EIGEN_CUFFT_CHECK(
internal::cufftExecC2R_dispatch(plan, static_cast<Complex*>(d_in_.ptr), static_cast<Scalar*>(d_out_.ptr)));
// Scale by 1/n.
scale_device_real(static_cast<Scalar*>(d_out_.ptr), n, Scalar(1) / Scalar(n));
RealVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- 2D Complex-to-Complex ------------------------------------------------
/** Forward 2D C2C FFT. Input and output are rows x cols complex matrices. */
template <typename Derived>
ComplexMatrix fwd2d(const MatrixBase<Derived>& A) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "fwd2d() requires complex input");
const ComplexMatrix input(A.derived());
const int rows = static_cast<int>(input.rows());
const int cols = static_cast<int>(input.cols());
if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
ensure_buffers(total, total);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));
ComplexMatrix result(rows, cols);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
/** Inverse 2D C2C FFT. Scaled by 1/(rows*cols). */
template <typename Derived>
ComplexMatrix inv2d(const MatrixBase<Derived>& A) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv2d() requires complex input");
const ComplexMatrix input(A.derived());
const int rows = static_cast<int>(input.rows());
const int cols = static_cast<int>(input.cols());
if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
ensure_buffers(total, total);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));
// Scale by 1/(rows*cols).
const int total_elems = rows * cols;
scale_device(static_cast<Complex*>(d_out_.ptr), total_elems, Scalar(1) / Scalar(total_elems));
ComplexMatrix result(rows, cols);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- Accessors ------------------------------------------------------------
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cublasHandle_t cublas_ = nullptr;
std::map<int64_t, cufftHandle> plans_;
internal::DeviceBuffer d_in_;
internal::DeviceBuffer d_out_;
size_t d_in_size_ = 0;
size_t d_out_size_ = 0;
void ensure_buffers(size_t in_bytes, size_t out_bytes) {
if (in_bytes > d_in_size_) {
if (d_in_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_in_ = internal::DeviceBuffer(in_bytes);
d_in_size_ = in_bytes;
}
if (out_bytes > d_out_size_) {
if (d_out_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_out_ = internal::DeviceBuffer(out_bytes);
d_out_size_ = out_bytes;
}
}
// Plan key encoding: rank (1 bit) | type (4 bits) | dims
static int64_t plan_key_1d(int n, cufftType type) { return (int64_t(n) << 5) | (int64_t(type) << 1) | 0; }
static int64_t plan_key_2d(int rows, int cols, cufftType type) {
return (int64_t(rows) << 35) | (int64_t(cols) << 5) | (int64_t(type) << 1) | 1;
}
cufftHandle get_plan_1d(int n, cufftType type) {
int64_t key = plan_key_1d(n, type);
auto it = plans_.find(key);
if (it != plans_.end()) return it->second;
cufftHandle plan;
EIGEN_CUFFT_CHECK(cufftPlan1d(&plan, n, type, /*batch=*/1));
EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
plans_[key] = plan;
return plan;
}
cufftHandle get_plan_2d(int rows, int cols, cufftType type) {
int64_t key = plan_key_2d(rows, cols, type);
auto it = plans_.find(key);
if (it != plans_.end()) return it->second;
// cuFFT uses row-major (C order) for 2D: first dim = rows, second = cols.
// Eigen matrices are column-major, so we pass (cols, rows) to cuFFT
// to get the correct 2D transform.
cufftHandle plan;
EIGEN_CUFFT_CHECK(cufftPlan2d(&plan, cols, rows, type));
EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
plans_[key] = plan;
return plan;
}
// Scale complex array on device using cuBLAS scal.
void scale_device(Complex* d_ptr, int n, Scalar alpha) { scale_complex(cublas_, d_ptr, n, alpha); }
// Scale real array on device using cuBLAS scal.
void scale_device_real(Scalar* d_ptr, int n, Scalar alpha) { scale_real(cublas_, d_ptr, n, alpha); }
// Type-dispatched cuBLAS scal wrappers (C++14 compatible).
static void scale_complex(cublasHandle_t h, std::complex<float>* p, int n, float a) {
EIGEN_CUBLAS_CHECK(cublasCsscal(h, n, &a, reinterpret_cast<cuComplex*>(p), 1));
}
static void scale_complex(cublasHandle_t h, std::complex<double>* p, int n, double a) {
EIGEN_CUBLAS_CHECK(cublasZdscal(h, n, &a, reinterpret_cast<cuDoubleComplex*>(p), 1));
}
static void scale_real(cublasHandle_t h, float* p, int n, float a) {
EIGEN_CUBLAS_CHECK(cublasSscal(h, n, &a, p, 1));
}
static void scale_real(cublasHandle_t h, double* p, int n, double a) {
EIGEN_CUBLAS_CHECK(cublasDscal(h, n, &a, p, 1));
}
};
} // namespace Eigen
#endif // EIGEN_GPU_FFT_H

View File

@@ -149,7 +149,7 @@ class GpuLLT {
// Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions). // Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions).
const PlainMatrix mat(A.derived()); const PlainMatrix mat(A.derived());
lda_ = static_cast<int64_t>(mat.outerStride()); lda_ = static_cast<int64_t>(mat.rows());
allocate_factor_storage(); allocate_factor_storage();
EIGEN_CUDA_RUNTIME_CHECK( EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_factor_.ptr, mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_)); cudaMemcpyAsync(d_factor_.ptr, mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_));
@@ -163,7 +163,7 @@ class GpuLLT {
eigen_assert(d_A.rows() == d_A.cols()); eigen_assert(d_A.rows() == d_A.cols());
if (!begin_compute(d_A.rows())) return *this; if (!begin_compute(d_A.rows())) return *this;
lda_ = static_cast<int64_t>(d_A.outerStride()); lda_ = static_cast<int64_t>(d_A.rows());
d_A.waitReady(stream_); d_A.waitReady(stream_);
allocate_factor_storage(); allocate_factor_storage();
EIGEN_CUDA_RUNTIME_CHECK( EIGEN_CUDA_RUNTIME_CHECK(
@@ -178,7 +178,7 @@ class GpuLLT {
eigen_assert(d_A.rows() == d_A.cols()); eigen_assert(d_A.rows() == d_A.cols());
if (!begin_compute(d_A.rows())) return *this; if (!begin_compute(d_A.rows())) return *this;
lda_ = static_cast<int64_t>(d_A.outerStride()); lda_ = static_cast<int64_t>(d_A.rows());
d_A.waitReady(stream_); d_A.waitReady(stream_);
d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release())); d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
@@ -205,7 +205,7 @@ class GpuLLT {
const PlainMatrix rhs(B); const PlainMatrix rhs(B);
const int64_t nrhs = static_cast<int64_t>(rhs.cols()); const int64_t nrhs = static_cast<int64_t>(rhs.cols());
const int64_t ldb = static_cast<int64_t>(rhs.outerStride()); const int64_t ldb = static_cast<int64_t>(rhs.rows());
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) { DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
EIGEN_CUDA_RUNTIME_CHECK( EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_ptr, rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_)); cudaMemcpyAsync(d_x_ptr, rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
@@ -234,7 +234,7 @@ class GpuLLT {
eigen_assert(d_B.rows() == n_); eigen_assert(d_B.rows() == n_);
d_B.waitReady(stream_); d_B.waitReady(stream_);
const int64_t nrhs = static_cast<int64_t>(d_B.cols()); const int64_t nrhs = static_cast<int64_t>(d_B.cols());
const int64_t ldb = static_cast<int64_t>(d_B.outerStride()); const int64_t ldb = static_cast<int64_t>(d_B.rows());
return solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) { return solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
EIGEN_CUDA_RUNTIME_CHECK( EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_ptr, d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_)); cudaMemcpyAsync(d_x_ptr, d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
@@ -332,7 +332,7 @@ class GpuLLT {
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, static_cast<int64_t>(n_), nrhs, dtype, EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, static_cast<int64_t>(n_), nrhs, dtype,
d_factor_.ptr, lda_, dtype, d_x_ptr, ldb, scratch_info())); d_factor_.ptr, lda_, dtype, d_x_ptr, ldb, scratch_info()));
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs), static_cast<Index>(ldb)); DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs));
result.recordReady(stream_); result.recordReady(stream_);
return result; return result;
} }

View File

@@ -140,7 +140,7 @@ class GpuLU {
if (!begin_compute(A.rows())) return *this; if (!begin_compute(A.rows())) return *this;
const PlainMatrix mat(A.derived()); const PlainMatrix mat(A.derived());
lda_ = static_cast<int64_t>(mat.outerStride()); lda_ = static_cast<int64_t>(mat.rows());
allocate_lu_storage(); allocate_lu_storage();
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_)); EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_));
@@ -153,7 +153,7 @@ class GpuLU {
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix"); eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
if (!begin_compute(d_A.rows())) return *this; if (!begin_compute(d_A.rows())) return *this;
lda_ = static_cast<int64_t>(d_A.outerStride()); lda_ = static_cast<int64_t>(d_A.rows());
d_A.waitReady(stream_); d_A.waitReady(stream_);
allocate_lu_storage(); allocate_lu_storage();
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_)); EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_));
@@ -167,7 +167,7 @@ class GpuLU {
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix"); eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
if (!begin_compute(d_A.rows())) return *this; if (!begin_compute(d_A.rows())) return *this;
lda_ = static_cast<int64_t>(d_A.outerStride()); lda_ = static_cast<int64_t>(d_A.rows());
d_A.waitReady(stream_); d_A.waitReady(stream_);
d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release())); d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
@@ -190,7 +190,7 @@ class GpuLU {
const PlainMatrix rhs(B); const PlainMatrix rhs(B);
const int64_t nrhs = static_cast<int64_t>(rhs.cols()); const int64_t nrhs = static_cast<int64_t>(rhs.cols());
const int64_t ldb = static_cast<int64_t>(rhs.outerStride()); const int64_t ldb = static_cast<int64_t>(rhs.rows());
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) { DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
EIGEN_CUDA_RUNTIME_CHECK( EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_ptr, rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_)); cudaMemcpyAsync(d_x_ptr, rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
@@ -213,7 +213,7 @@ class GpuLU {
eigen_assert(d_B.rows() == n_); eigen_assert(d_B.rows() == n_);
d_B.waitReady(stream_); d_B.waitReady(stream_);
const int64_t nrhs = static_cast<int64_t>(d_B.cols()); const int64_t nrhs = static_cast<int64_t>(d_B.cols());
const int64_t ldb = static_cast<int64_t>(d_B.outerStride()); const int64_t ldb = static_cast<int64_t>(d_B.rows());
return solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) { return solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
EIGEN_CUDA_RUNTIME_CHECK( EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_ptr, d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_)); cudaMemcpyAsync(d_x_ptr, d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
@@ -305,7 +305,7 @@ class GpuLU {
lda_, static_cast<const int64_t*>(d_ipiv_.ptr), dtype, d_x_ptr, ldb, lda_, static_cast<const int64_t*>(d_ipiv_.ptr), dtype, d_x_ptr, ldb,
scratch_info())); scratch_info()));
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs), static_cast<Index>(ldb)); DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs));
result.recordReady(stream_); result.recordReady(stream_);
return result; return result;
} }

389
Eigen/src/GPU/GpuQR.h Normal file
View File

@@ -0,0 +1,389 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU QR decomposition using cuSOLVER.
//
// Wraps cusolverDnXgeqrf (factorization), cusolverDnXormqr (apply Q),
// cusolverDnXorgqr (form Q), and cublasXtrsm (triangular solve on R).
//
// The factored matrix (reflectors + R) and tau stay in device memory.
// Solve uses ormqr + trsm without forming Q explicitly.
//
// Usage:
// GpuQR<double> qr(A); // upload A, geqrf
// if (qr.info() != Success) { ... }
// MatrixXd X = qr.solve(B); // Q^H * B via ormqr, then trsm on R
//
// Expression syntax:
// d_X = d_A.qr().solve(d_B); // temporary, no caching
#ifndef EIGEN_GPU_QR_H
#define EIGEN_GPU_QR_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSolverSupport.h"
#include "./CuBlasSupport.h"
#include <vector>
namespace Eigen {
template <typename Scalar_>
class GpuQR {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
GpuQR() { init_context(); }
template <typename InputType>
explicit GpuQR(const EigenBase<InputType>& A) {
init_context();
compute(A);
}
~GpuQR() {
if (handle_) (void)cusolverDnDestroy(handle_);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuQR(const GpuQR&) = delete;
GpuQR& operator=(const GpuQR&) = delete;
GpuQR(GpuQR&& o) noexcept
: stream_(o.stream_),
handle_(o.handle_),
cublas_(o.cublas_),
params_(std::move(o.params_)),
d_qr_(std::move(o.d_qr_)),
d_tau_(std::move(o.d_tau_)),
d_scratch_(std::move(o.d_scratch_)),
scratch_size_(o.scratch_size_),
h_workspace_(std::move(o.h_workspace_)),
m_(o.m_),
n_(o.n_),
lda_(o.lda_),
info_(o.info_),
info_word_(o.info_word_),
info_synced_(o.info_synced_) {
o.stream_ = nullptr;
o.handle_ = nullptr;
o.cublas_ = nullptr;
o.scratch_size_ = 0;
o.m_ = 0;
o.n_ = 0;
o.lda_ = 0;
o.info_ = InvalidInput;
o.info_word_ = 0;
o.info_synced_ = true;
}
GpuQR& operator=(GpuQR&& o) noexcept {
if (this != &o) {
if (handle_) (void)cusolverDnDestroy(handle_);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
stream_ = o.stream_;
handle_ = o.handle_;
cublas_ = o.cublas_;
params_ = std::move(o.params_);
d_qr_ = std::move(o.d_qr_);
d_tau_ = std::move(o.d_tau_);
d_scratch_ = std::move(o.d_scratch_);
scratch_size_ = o.scratch_size_;
h_workspace_ = std::move(o.h_workspace_);
m_ = o.m_;
n_ = o.n_;
lda_ = o.lda_;
info_ = o.info_;
info_word_ = o.info_word_;
info_synced_ = o.info_synced_;
o.stream_ = nullptr;
o.handle_ = nullptr;
o.cublas_ = nullptr;
o.scratch_size_ = 0;
o.m_ = 0;
o.n_ = 0;
o.lda_ = 0;
o.info_ = InvalidInput;
o.info_word_ = 0;
o.info_synced_ = true;
}
return *this;
}
// ---- Factorization -------------------------------------------------------
template <typename InputType>
GpuQR& compute(const EigenBase<InputType>& A) {
m_ = A.rows();
n_ = A.cols();
info_ = InvalidInput;
info_synced_ = false;
if (m_ == 0 || n_ == 0) {
info_ = Success;
info_synced_ = true;
return *this;
}
const PlainMatrix mat(A.derived());
lda_ = static_cast<int64_t>(mat.rows());
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
const size_t tau_bytes = static_cast<size_t>((std::min)(m_, n_)) * sizeof(Scalar);
d_qr_ = internal::DeviceBuffer(mat_bytes);
d_tau_ = internal::DeviceBuffer(tau_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_qr_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
factorize();
return *this;
}
GpuQR& compute(const DeviceMatrix<Scalar>& d_A) {
m_ = d_A.rows();
n_ = d_A.cols();
info_ = InvalidInput;
info_synced_ = false;
if (m_ == 0 || n_ == 0) {
info_ = Success;
info_synced_ = true;
return *this;
}
lda_ = static_cast<int64_t>(d_A.rows());
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
const size_t tau_bytes = static_cast<size_t>((std::min)(m_, n_)) * sizeof(Scalar);
d_A.waitReady(stream_);
d_qr_ = internal::DeviceBuffer(mat_bytes);
d_tau_ = internal::DeviceBuffer(tau_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_qr_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
factorize();
return *this;
}
// ---- Solve ---------------------------------------------------------------
/** Solve A * X = B via QR: X = R^{-1} * Q^H * B (least-squares for m >= n).
* Uses ormqr (apply Q^H) + trsm (solve R), without forming Q explicitly.
* Requires m >= n (overdetermined or square). Underdetermined not supported.
*
* TODO: Add device-side accessor for the R factor (and Q application) as
* DeviceMatrix, so users can chain GPU operations without host round-trips. */
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
sync_info();
eigen_assert(info_ == Success && "GpuQR::solve called on a failed or uninitialized factorization");
eigen_assert(B.rows() == m_);
eigen_assert(m_ >= n_ && "GpuQR::solve requires m >= n (use SVD for underdetermined systems)");
const PlainMatrix rhs(B);
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
const int64_t ldb = static_cast<int64_t>(rhs.rows()); // = m_
const size_t b_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
// Upload B to device (m × nrhs buffer).
internal::DeviceBuffer d_B(b_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.ptr, rhs.data(), b_bytes, cudaMemcpyHostToDevice, stream_));
// Apply Q^H to B in-place: d_B becomes m × nrhs, first n rows hold Q^H * B relevant part.
apply_QH(d_B.ptr, ldb, nrhs);
// Solve R * X = (Q^H * B)[0:n,:] via trsm on the first n rows.
Scalar alpha(1);
EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT, static_cast<int>(n_), static_cast<int>(nrhs), &alpha,
static_cast<const Scalar*>(d_qr_.ptr), static_cast<int>(lda_),
static_cast<Scalar*>(d_B.ptr), static_cast<int>(ldb)));
// Download the first n rows of each column (stride = ldb = m, width = n).
PlainMatrix X(n_, rhs.cols());
if (m_ == n_) {
// Square: dense copy, no stride mismatch.
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_B.ptr,
static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar),
cudaMemcpyDeviceToHost, stream_));
} else {
// Overdetermined: 2D copy to extract first n rows from each column.
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(
X.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_B.ptr, static_cast<size_t>(ldb) * sizeof(Scalar),
static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs), cudaMemcpyDeviceToHost, stream_));
}
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return X;
}
/** Solve with device-resident RHS. Returns n × nrhs DeviceMatrix. */
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
sync_info();
eigen_assert(info_ == Success && "GpuQR::solve called on a failed or uninitialized factorization");
eigen_assert(d_B.rows() == m_);
eigen_assert(m_ >= n_ && "GpuQR::solve requires m >= n (use SVD for underdetermined systems)");
d_B.waitReady(stream_);
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
const int64_t ldb = static_cast<int64_t>(d_B.rows()); // = m_
const size_t b_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
// D2D copy B into working buffer (ormqr and trsm are in-place).
internal::DeviceBuffer d_work(b_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_work.ptr, d_B.data(), b_bytes, cudaMemcpyDeviceToDevice, stream_));
apply_QH(d_work.ptr, ldb, nrhs);
// trsm on the first n rows.
Scalar alpha(1);
EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT, static_cast<int>(n_), static_cast<int>(nrhs), &alpha,
static_cast<const Scalar*>(d_qr_.ptr), static_cast<int>(lda_),
static_cast<Scalar*>(d_work.ptr), static_cast<int>(ldb)));
if (m_ == n_) {
// Square: result is the whole buffer, dense.
DeviceMatrix<Scalar> result(static_cast<Scalar*>(d_work.ptr), n_, static_cast<Index>(nrhs));
d_work.ptr = nullptr; // transfer ownership
result.recordReady(stream_);
return result;
} else {
// Overdetermined: copy first n rows of each column into a dense n × nrhs result.
DeviceMatrix<Scalar> result(n_, static_cast<Index>(nrhs));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(result.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_work.ptr,
static_cast<size_t>(ldb) * sizeof(Scalar),
static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs),
cudaMemcpyDeviceToDevice, stream_));
result.recordReady(stream_);
return result;
// d_work freed here via RAII — safe because stream is ordered.
}
}
// ---- Accessors -----------------------------------------------------------
ComputationInfo info() const {
sync_info();
return info_;
}
Index rows() const { return m_; }
Index cols() const { return n_; }
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusolverDnHandle_t handle_ = nullptr;
cublasHandle_t cublas_ = nullptr;
internal::CusolverParams params_;
internal::DeviceBuffer d_qr_; // QR factors (reflectors in lower, R in upper)
internal::DeviceBuffer d_tau_; // Householder scalars (min(m,n))
internal::DeviceBuffer d_scratch_; // workspace + info word
size_t scratch_size_ = 0;
std::vector<char> h_workspace_;
Index m_ = 0;
Index n_ = 0;
int64_t lda_ = 0;
ComputationInfo info_ = InvalidInput;
int info_word_ = 0;
bool info_synced_ = true;
void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
ensure_scratch(0);
}
void ensure_scratch(size_t workspace_bytes) {
constexpr size_t kAlign = 16;
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
size_t needed = workspace_bytes + sizeof(int);
if (needed > scratch_size_) {
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_scratch_ = internal::DeviceBuffer(needed);
scratch_size_ = needed;
}
}
void* scratch_workspace() const { return d_scratch_.ptr; }
int* scratch_info() const {
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
}
void sync_info() const {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
const_cast<GpuQR*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
const_cast<GpuQR*>(this)->info_synced_ = true;
}
}
void factorize() {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
info_synced_ = false;
info_ = InvalidInput;
size_t dev_ws = 0, host_ws = 0;
EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf_bufferSize(handle_, params_.p, static_cast<int64_t>(m_),
static_cast<int64_t>(n_), dtype, d_qr_.ptr, lda_, dtype,
d_tau_.ptr, dtype, &dev_ws, &host_ws));
ensure_scratch(dev_ws);
h_workspace_.resize(host_ws);
EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf(handle_, params_.p, static_cast<int64_t>(m_), static_cast<int64_t>(n_), dtype,
d_qr_.ptr, lda_, dtype, d_tau_.ptr, dtype, scratch_workspace(), dev_ws,
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
}
// Apply Q^H to a device buffer in-place: d_B = Q^H * d_B.
// Uses type-specific ormqr (real) or unmqr (complex) wrappers from CuSolverSupport.h.
// For real types: Q^H = Q^T, use CUBLAS_OP_T. For complex: use CUBLAS_OP_C.
void apply_QH(void* d_B, int64_t ldb, int64_t nrhs) const {
const int im = static_cast<int>(m_);
const int in = static_cast<int>(nrhs);
const int ik = static_cast<int>((std::min)(m_, n_));
const int ilda = static_cast<int>(lda_);
const int ildb = static_cast<int>(ldb);
constexpr cublasOperation_t trans = NumTraits<Scalar>::IsComplex ? CUBLAS_OP_C : CUBLAS_OP_T;
int lwork = 0;
EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr_bufferSize(
handle_, CUBLAS_SIDE_LEFT, trans, im, in, ik, static_cast<const Scalar*>(d_qr_.ptr), ilda,
static_cast<const Scalar*>(d_tau_.ptr), static_cast<const Scalar*>(d_B), ildb, &lwork));
internal::DeviceBuffer d_work(static_cast<size_t>(lwork) * sizeof(Scalar));
EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr(handle_, CUBLAS_SIDE_LEFT, trans, im, in, ik,
static_cast<const Scalar*>(d_qr_.ptr), ilda,
static_cast<const Scalar*>(d_tau_.ptr), static_cast<Scalar*>(d_B),
ildb, static_cast<Scalar*>(d_work.ptr), lwork, scratch_info()));
// Sync to ensure workspace can be freed safely, and check ormqr info.
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
int ormqr_info = 0;
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(&ormqr_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost));
eigen_assert(ormqr_info == 0 && "cusolverDnXormqr reported an error");
}
};
} // namespace Eigen
#endif // EIGEN_GPU_QR_H

490
Eigen/src/GPU/GpuSVD.h Normal file
View File

@@ -0,0 +1,490 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU SVD decomposition using cuSOLVER (divide-and-conquer).
//
// Wraps cusolverDnXgesvd. Stores U, S, VT on device. Solve uses
// cuBLAS GEMM: X = VT^H * diag(D) * U^H * B.
//
// cuSOLVER returns VT (not V). We store and expose VT directly.
//
// Usage:
// GpuSVD<double> svd(A, ComputeThinU | ComputeThinV);
// VectorXd S = svd.singularValues();
// MatrixXd U = svd.matrixU(); // m×k or m×m
// MatrixXd VT = svd.matrixVT(); // k×n or n×n (this is V^T)
// MatrixXd X = svd.solve(B); // pseudoinverse
// MatrixXd X = svd.solve(B, k); // truncated (top k triplets)
// MatrixXd X = svd.solve(B, 0.1); // Tikhonov regularized
#ifndef EIGEN_GPU_SVD_H
#define EIGEN_GPU_SVD_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSolverSupport.h"
#include "./CuBlasSupport.h"
#include <vector>
namespace Eigen {
template <typename Scalar_>
class GpuSVD {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
using RealVector = Matrix<RealScalar, Dynamic, 1>;
GpuSVD() { init_context(); }
template <typename InputType>
explicit GpuSVD(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
init_context();
compute(A, options);
}
~GpuSVD() {
if (handle_) (void)cusolverDnDestroy(handle_);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuSVD(const GpuSVD&) = delete;
GpuSVD& operator=(const GpuSVD&) = delete;
// Move constructors omitted for brevity — follow GpuQR pattern.
// ---- Factorization -------------------------------------------------------
template <typename InputType>
GpuSVD& compute(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
options_ = options;
m_ = A.rows();
n_ = A.cols();
info_ = InvalidInput;
info_synced_ = false;
if (m_ == 0 || n_ == 0) {
info_ = Success;
info_synced_ = true;
return *this;
}
// cuSOLVER gesvd requires m >= n. For wide matrices, transpose internally.
transposed_ = (m_ < n_);
const PlainMatrix mat = transposed_ ? PlainMatrix(A.derived().adjoint()) : PlainMatrix(A.derived());
if (transposed_) std::swap(m_, n_);
lda_ = static_cast<int64_t>(mat.rows());
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
// Copy (possibly transposed) A to device (gesvd overwrites it).
d_A_ = internal::DeviceBuffer(mat_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
factorize();
return *this;
}
GpuSVD& compute(const DeviceMatrix<Scalar>& d_A, unsigned int options = ComputeThinU | ComputeThinV) {
options_ = options;
m_ = d_A.rows();
n_ = d_A.cols();
info_ = InvalidInput;
info_synced_ = false;
if (m_ == 0 || n_ == 0) {
info_ = Success;
info_synced_ = true;
return *this;
}
transposed_ = (m_ < n_);
d_A.waitReady(stream_);
if (transposed_) {
// Transpose on device via cuBLAS geam: d_A_ = A^H.
std::swap(m_, n_);
lda_ = m_;
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
d_A_ = internal::DeviceBuffer(mat_bytes);
Scalar alpha_one(1), beta_zero(0);
// geam: C(m×n) = alpha * op(A) + beta * op(B). Use B = nullptr trick: beta=0.
// A is the original d_A (n_orig × m_orig = n × m after swap), transposed → m × n.
EIGEN_CUBLAS_CHECK(internal::cublasXgeam(
cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(m_), static_cast<int>(n_), &alpha_one, d_A.data(),
static_cast<int>(d_A.rows()), &beta_zero, static_cast<const Scalar*>(nullptr), static_cast<int>(m_),
static_cast<Scalar*>(d_A_.ptr), static_cast<int>(m_)));
} else {
lda_ = static_cast<int64_t>(d_A.rows());
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
d_A_ = internal::DeviceBuffer(mat_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
}
factorize();
return *this;
}
// ---- Accessors -----------------------------------------------------------
ComputationInfo info() const {
sync_info();
return info_;
}
Index rows() const { return transposed_ ? n_ : m_; }
Index cols() const { return transposed_ ? m_ : n_; }
// TODO: Add device-side accessors (deviceU(), deviceVT(), deviceSingularValues())
// returning DeviceMatrix views of the internal buffers, so users can chain
// GPU operations without round-tripping through host memory.
/** Singular values (always available). Downloads from device on each call. */
RealVector singularValues() const {
sync_info();
eigen_assert(info_ == Success);
const Index k = (std::min)(m_, n_);
RealVector S(k);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpy(S.data(), d_S_.ptr, static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
return S;
}
/** Left singular vectors U. Returns m_orig × k or m_orig × m_orig.
* For transposed case (m_orig < n_orig), U comes from cuSOLVER's VT. */
PlainMatrix matrixU() const {
sync_info();
eigen_assert(info_ == Success);
eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "matrixU() requires ComputeThinU or ComputeFullU");
const Index m_orig = transposed_ ? n_ : m_;
const Index n_orig = transposed_ ? m_ : n_;
const Index k = (std::min)(m_orig, n_orig);
if (!transposed_) {
const Index ucols = (options_ & ComputeFullU) ? m_ : k;
PlainMatrix U(m_, ucols);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U.data(), d_U_.ptr,
static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
cudaMemcpyDeviceToHost));
return U;
} else {
// Transposed: U_orig = VT_stored^H. VT_stored is vtrows × n_ (= vtrows × m_orig).
const Index vtrows = (options_ & ComputeFullU) ? m_orig : k; // Note: FullU maps to FullV of A^H
PlainMatrix VT_stored(vtrows, n_);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT_stored.data(), d_VT_.ptr,
static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
cudaMemcpyDeviceToHost));
return VT_stored.adjoint(); // m_orig × vtrows
}
}
/** Right singular vectors transposed V^T. Returns k × n_orig or n_orig × n_orig.
* For transposed case, VT comes from cuSOLVER's U. */
PlainMatrix matrixVT() const {
sync_info();
eigen_assert(info_ == Success);
eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "matrixVT() requires ComputeThinV or ComputeFullV");
const Index m_orig = transposed_ ? n_ : m_;
const Index n_orig = transposed_ ? m_ : n_;
const Index k = (std::min)(m_orig, n_orig);
if (!transposed_) {
const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
PlainMatrix VT(vtrows, n_);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT.data(), d_VT_.ptr,
static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
cudaMemcpyDeviceToHost));
return VT;
} else {
// Transposed: VT_orig = U_stored^H. U_stored is m_ × ucols (= n_orig × ucols).
const Index ucols = (options_ & ComputeFullV) ? n_orig : k; // FullV maps to FullU of A^H
PlainMatrix U_stored(m_, ucols);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U_stored.data(), d_U_.ptr,
static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
cudaMemcpyDeviceToHost));
return U_stored.adjoint(); // ucols × n_orig
}
}
/** Number of singular values above threshold. */
Index rank(RealScalar threshold = RealScalar(-1)) const {
RealVector S = singularValues();
if (S.size() == 0) return 0;
if (threshold < 0) {
threshold = (std::max)(m_, n_) * S(0) * NumTraits<RealScalar>::epsilon();
}
Index r = 0;
for (Index i = 0; i < S.size(); ++i) {
if (S(i) > threshold) ++r;
}
return r;
}
// ---- Solve ---------------------------------------------------------------
/** Pseudoinverse solve: X = V * diag(1/S) * U^H * B. */
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
return solve_impl(B, (std::min)(m_, n_), RealScalar(0));
}
/** Truncated solve: use only top trunc singular triplets. */
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B, Index trunc) const {
eigen_assert(trunc > 0 && trunc <= (std::min)(m_, n_));
return solve_impl(B, trunc, RealScalar(0));
}
/** Tikhonov-regularized solve: D_ii = S_i / (S_i^2 + lambda^2). */
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B, RealScalar lambda) const {
eigen_assert(lambda > 0);
return solve_impl(B, (std::min)(m_, n_), lambda);
}
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusolverDnHandle_t handle_ = nullptr;
cublasHandle_t cublas_ = nullptr;
internal::CusolverParams params_;
internal::DeviceBuffer d_A_; // working copy of A (overwritten by gesvd)
internal::DeviceBuffer d_U_; // left singular vectors
internal::DeviceBuffer d_S_; // singular values (RealScalar)
internal::DeviceBuffer d_VT_; // right singular vectors transposed
internal::DeviceBuffer d_scratch_; // workspace + info
size_t scratch_size_ = 0;
std::vector<char> h_workspace_;
unsigned int options_ = 0;
Index m_ = 0;
Index n_ = 0;
int64_t lda_ = 0;
bool transposed_ = false; // true if m < n (we compute SVD of A^T internally)
ComputationInfo info_ = InvalidInput;
int info_word_ = 0;
bool info_synced_ = true;
void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
ensure_scratch(0);
}
void ensure_scratch(size_t workspace_bytes) {
constexpr size_t kAlign = 16;
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
size_t needed = workspace_bytes + sizeof(int);
if (needed > scratch_size_) {
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_scratch_ = internal::DeviceBuffer(needed);
scratch_size_ = needed;
}
}
void* scratch_workspace() const { return d_scratch_.ptr; }
int* scratch_info() const {
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
}
void sync_info() const {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
const_cast<GpuSVD*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
const_cast<GpuSVD*>(this)->info_synced_ = true;
}
}
// Swap U↔V flags for the transposed case.
static unsigned int swap_uv_options(unsigned int opts) {
unsigned int result = 0;
if (opts & ComputeThinU) result |= ComputeThinV;
if (opts & ComputeFullU) result |= ComputeFullV;
if (opts & ComputeThinV) result |= ComputeThinU;
if (opts & ComputeFullV) result |= ComputeFullU;
return result;
}
static signed char jobu(unsigned int opts) {
if (opts & ComputeFullU) return 'A';
if (opts & ComputeThinU) return 'S';
return 'N';
}
static signed char jobvt(unsigned int opts) {
if (opts & ComputeFullV) return 'A';
if (opts & ComputeThinV) return 'S';
return 'N';
}
void factorize() {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
const Index k = (std::min)(m_, n_);
info_synced_ = false;
info_ = InvalidInput;
// Allocate output buffers. When transposed, swap U/V roles for cuSOLVER.
d_S_ = internal::DeviceBuffer(static_cast<size_t>(k) * sizeof(RealScalar));
// Internal options: for transposed case, what user wants as U we compute as VT of A^H.
const unsigned int int_opts = transposed_ ? swap_uv_options(options_) : options_;
const Index ucols = (int_opts & ComputeFullU) ? m_ : ((int_opts & ComputeThinU) ? k : 0);
const Index vtrows = (int_opts & ComputeFullV) ? n_ : ((int_opts & ComputeThinV) ? k : 0);
const int64_t ldu = m_;
const int64_t ldvt = vtrows > 0 ? vtrows : 1;
if (ucols > 0) d_U_ = internal::DeviceBuffer(static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar));
if (vtrows > 0)
d_VT_ = internal::DeviceBuffer(static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar));
// computeType must match the matrix data type (dtype), not the singular value type (rtype).
eigen_assert(m_ >= n_ && "Internal error: m_ < n_ should have been handled by transpose in compute()");
size_t dev_ws = 0, host_ws = 0;
EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd_bufferSize(
handle_, params_.p, jobu(int_opts), jobvt(int_opts), static_cast<int64_t>(m_), static_cast<int64_t>(n_), dtype,
d_A_.ptr, lda_, rtype, d_S_.ptr, dtype, ucols > 0 ? d_U_.ptr : nullptr, ldu, dtype,
vtrows > 0 ? d_VT_.ptr : nullptr, ldvt, dtype, &dev_ws, &host_ws));
ensure_scratch(dev_ws);
h_workspace_.resize(host_ws);
// Compute SVD.
EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd(handle_, params_.p, jobu(int_opts), jobvt(int_opts), static_cast<int64_t>(m_),
static_cast<int64_t>(n_), dtype, d_A_.ptr, lda_, rtype, d_S_.ptr, dtype,
ucols > 0 ? d_U_.ptr : nullptr, ldu, dtype, vtrows > 0 ? d_VT_.ptr : nullptr,
ldvt, dtype, scratch_workspace(), dev_ws,
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
}
// Internal solve: X = V * diag(D) * U^H * B, using top `trunc` triplets.
// D_ii = 1/S_i (if lambda==0) or S_i/(S_i^2+lambda^2).
//
// For non-transposed: stored U, VT. X = VT^H * D * U^H * B.
// For transposed (SVD of A^H): stored U', VT'. X = U' * D * VT' * B.
template <typename Rhs>
PlainMatrix solve_impl(const MatrixBase<Rhs>& B, Index trunc, RealScalar lambda) const {
sync_info();
eigen_assert(info_ == Success && "GpuSVD::solve called on a failed or uninitialized decomposition");
eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "solve requires U");
eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "solve requires V");
const Index m_orig = transposed_ ? n_ : m_;
const Index n_orig = transposed_ ? m_ : n_;
eigen_assert(B.rows() == m_orig);
const Index k = (std::min)(m_, n_); // = min(m_orig, n_orig)
const Index kk = (std::min)(trunc, k);
const Index nrhs = B.cols();
// Download S to host to build the diagonal scaling.
RealVector S(k);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpy(S.data(), d_S_.ptr, static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
// Upload B (m_orig × nrhs).
const PlainMatrix rhs(B);
internal::DeviceBuffer d_B(static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.ptr, rhs.data(),
static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
cudaMemcpyHostToDevice, stream_));
// Step 1: tmp = U_orig^H * B (kk × nrhs).
// Non-transposed: U_stored is m_×ucols, U_orig = U_stored. Use U_stored^H * B.
// Transposed: U_orig = VT_stored^H, so U_orig^H = VT_stored. Use VT_stored * B (no transpose!).
internal::DeviceBuffer d_tmp(static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar));
{
Scalar alpha_one(1), beta_zero(0);
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;
if (!transposed_) {
// U_stored^H * B: (m_×kk)^H × (m_×nrhs) → kk×nrhs.
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs),
static_cast<int>(m_), &alpha_one, d_U_.ptr, dtype, static_cast<int>(m_),
d_B.ptr, dtype, static_cast<int>(m_orig), &beta_zero, d_tmp.ptr, dtype,
static_cast<int>(kk), compute, internal::cuda_gemm_algo()));
} else {
// VT_stored * B: VT_stored is vtrows×n_ = kk×m_orig (thin), NoTrans.
// vtrows×m_orig times m_orig×nrhs → vtrows×nrhs. Use first kk rows.
const Index vtrows_stored = (swap_uv_options(options_) & ComputeFullV) ? n_ : k;
EIGEN_CUBLAS_CHECK(cublasGemmEx(
cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs), static_cast<int>(m_orig),
&alpha_one, d_VT_.ptr, dtype, static_cast<int>(vtrows_stored), d_B.ptr, dtype, static_cast<int>(m_orig),
&beta_zero, d_tmp.ptr, dtype, static_cast<int>(kk), compute, internal::cuda_gemm_algo()));
}
}
// Step 2: Scale row i of tmp by D_ii.
// Download tmp to host, scale, re-upload. (Simple and correct; a device kernel would be faster.)
{
PlainMatrix tmp(kk, nrhs);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(tmp.data(), d_tmp.ptr,
static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar),
cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
for (Index i = 0; i < kk; ++i) {
RealScalar si = S(i);
RealScalar di = (lambda == RealScalar(0)) ? (si > 0 ? RealScalar(1) / si : RealScalar(0))
: si / (si * si + lambda * lambda);
tmp.row(i) *= Scalar(di);
}
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_tmp.ptr, tmp.data(),
static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar),
cudaMemcpyHostToDevice, stream_));
}
// Step 3: X = V_orig * tmp (n_orig × nrhs).
// Non-transposed: V_orig = VT_stored^H. VT_stored[:kk,:]^H * tmp → n_orig × nrhs.
// Transposed: V_orig = U_stored[:,:kk]. U_stored * tmp → n_orig × nrhs (NoTrans).
PlainMatrix X(n_orig, nrhs);
{
internal::DeviceBuffer d_X(static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
Scalar alpha_one(1), beta_zero(0);
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;
if (!transposed_) {
const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(n_orig),
static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_VT_.ptr, dtype,
static_cast<int>(vtrows), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero,
d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo()));
} else {
// U_stored is m_×ucols. V_orig = U_stored[:,:kk]. NoTrans × tmp.
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(n_orig),
static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_U_.ptr, dtype,
static_cast<int>(m_), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero,
d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo()));
}
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_X.ptr,
static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
}
return X;
}
};
} // namespace Eigen
#endif // EIGEN_GPU_SVD_H

View File

@@ -0,0 +1,321 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix
// multiply (SpMM) via cuSPARSE.
//
// GpuSparseContext manages a cuSPARSE handle and device buffers. It accepts
// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the
// GPU. RowMajor input is implicitly converted to ColMajor.
//
// Usage:
// GpuSparseContext<double> ctx;
// VectorXd y = ctx.multiply(A, x); // y = A * x
// ctx.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y
// VectorXd z = ctx.multiplyT(A, x); // z = A^T * x
// MatrixXd Y = ctx.multiplyMat(A, X); // Y = A * X (multiple RHS)
#ifndef EIGEN_GPU_SPARSE_CONTEXT_H
#define EIGEN_GPU_SPARSE_CONTEXT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSparseSupport.h"
namespace Eigen {
template <typename Scalar_>
class GpuSparseContext {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using StorageIndex = int;
using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
using DenseVector = Matrix<Scalar, Dynamic, 1>;
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
GpuSparseContext() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_));
EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_));
}
~GpuSparseContext() {
destroy_descriptors();
if (handle_) (void)cusparseDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuSparseContext(const GpuSparseContext&) = delete;
GpuSparseContext& operator=(const GpuSparseContext&) = delete;
// ---- SpMV: y = A * x -----------------------------------------------------
/** Compute y = A * x. Returns y as a new dense vector. */
template <typename InputType, typename Rhs>
DenseVector multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const SpMat mat(A.derived());
DenseVector y(mat.rows());
y.setZero();
multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
return y;
}
/** Compute y = alpha * op(A) * x + beta * y (in-place). */
template <typename InputType, typename Rhs, typename Dest>
void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y,
Scalar alpha = Scalar(1), Scalar beta = Scalar(0),
cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
const SpMat mat(A.derived());
multiply_impl(mat, x.derived(), y.derived(), alpha, beta, op);
}
// ---- SpMV transpose: y = A^T * x -----------------------------------------
/** Compute y = A^T * x. Returns y as a new dense vector. */
template <typename InputType, typename Rhs>
DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const SpMat mat(A.derived());
DenseVector y(mat.cols());
y.setZero();
multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE);
return y;
}
// ---- SpMM: Y = A * X (multiple RHS) --------------------------------------
/** Compute Y = A * X where X is a dense matrix (multiple RHS). Returns Y. */
template <typename InputType, typename Rhs>
DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X) {
const SpMat mat(A.derived());
const DenseMatrix rhs(X.derived());
eigen_assert(mat.cols() == rhs.rows());
const Index m = mat.rows();
const Index n = rhs.cols();
if (m == 0 || n == 0 || mat.nonZeros() == 0) return DenseMatrix::Zero(m, n);
DenseMatrix Y = DenseMatrix::Zero(m, n);
spmm_impl(mat, rhs, Y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
return Y;
}
// ---- Accessors ------------------------------------------------------------
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusparseHandle_t handle_ = nullptr;
// Cached device buffers (grow-only).
internal::DeviceBuffer d_outerPtr_;
internal::DeviceBuffer d_innerIdx_;
internal::DeviceBuffer d_values_;
internal::DeviceBuffer d_x_;
internal::DeviceBuffer d_y_;
internal::DeviceBuffer d_workspace_;
size_t d_outerPtr_size_ = 0;
size_t d_innerIdx_size_ = 0;
size_t d_values_size_ = 0;
size_t d_x_size_ = 0;
size_t d_y_size_ = 0;
size_t d_workspace_size_ = 0;
// Cached cuSPARSE descriptors.
cusparseSpMatDescr_t spmat_desc_ = nullptr;
Index cached_rows_ = -1;
Index cached_cols_ = -1;
Index cached_nnz_ = -1;
// ---- SpMV implementation --------------------------------------------------
template <typename RhsDerived, typename DestDerived>
void multiply_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta,
cusparseOperation_t op) {
eigen_assert(A.isCompressed());
const Index m = A.rows();
const Index n = A.cols();
const Index nnz = A.nonZeros();
const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
eigen_assert(x.size() == x_size);
eigen_assert(y.size() == y_size);
if (m == 0 || n == 0 || nnz == 0) {
if (beta == Scalar(0))
y.setZero();
else
y *= beta;
return;
}
// Upload sparse matrix to device.
upload_sparse(A);
// Upload x to device.
ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar));
const DenseVector x_tmp(x);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_.ptr, x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
// Upload y to device (for beta != 0).
ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar));
if (beta != Scalar(0)) {
const DenseVector y_tmp(y);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_y_.ptr, y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
}
// Create dense vector descriptors.
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr;
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_.ptr, dtype));
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_.ptr, dtype));
// Query workspace size.
size_t ws_size = 0;
EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
CUSPARSE_SPMV_ALG_DEFAULT, &ws_size));
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
// Execute SpMV.
EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.ptr));
// Download result.
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(y.data(), d_y_.ptr, y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
(void)cusparseDestroyDnVec(x_desc);
(void)cusparseDestroyDnVec(y_desc);
}
// ---- SpMM implementation --------------------------------------------------
void spmm_impl(const SpMat& A, const DenseMatrix& X, DenseMatrix& Y, Scalar alpha, Scalar beta,
cusparseOperation_t op) {
eigen_assert(A.isCompressed());
const Index m = A.rows();
const Index n = X.cols();
const Index k = A.cols();
const Index nnz = A.nonZeros();
if (m == 0 || n == 0 || k == 0 || nnz == 0) {
if (beta == Scalar(0))
Y.setZero();
else
Y *= beta;
return;
}
upload_sparse(A);
// Upload X to device.
const size_t x_bytes = static_cast<size_t>(k) * static_cast<size_t>(n) * sizeof(Scalar);
const size_t y_bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(Scalar);
ensure_buffer(d_x_, d_x_size_, x_bytes);
ensure_buffer(d_y_, d_y_size_, y_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_x_.ptr, X.data(), x_bytes, cudaMemcpyHostToDevice, stream_));
if (beta != Scalar(0)) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.ptr, Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_));
}
// Create dense matrix descriptors.
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr;
// Eigen is column-major, so ld = rows.
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k, n, k, d_x_.ptr, dtype, CUSPARSE_ORDER_COL));
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m, n, m, d_y_.ptr, dtype, CUSPARSE_ORDER_COL));
// Query workspace.
size_t ws_size = 0;
EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_,
x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size));
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
// Execute SpMM.
EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta,
y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.ptr));
// Download result.
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.ptr, y_bytes, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
(void)cusparseDestroyDnMat(x_desc);
(void)cusparseDestroyDnMat(y_desc);
}
// ---- Helpers --------------------------------------------------------------
void upload_sparse(const SpMat& A) {
const Index m = A.rows();
const Index n = A.cols();
const Index nnz = A.nonZeros();
const size_t outer_bytes = static_cast<size_t>(n + 1) * sizeof(StorageIndex);
const size_t inner_bytes = static_cast<size_t>(nnz) * sizeof(StorageIndex);
const size_t val_bytes = static_cast<size_t>(nnz) * sizeof(Scalar);
ensure_buffer(d_outerPtr_, d_outerPtr_size_, outer_bytes);
ensure_buffer(d_innerIdx_, d_innerIdx_size_, inner_bytes);
ensure_buffer(d_values_, d_values_size_, val_bytes);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_outerPtr_.ptr, A.outerIndexPtr(), outer_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_innerIdx_.ptr, A.innerIndexPtr(), inner_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, A.valuePtr(), val_bytes, cudaMemcpyHostToDevice, stream_));
// Recreate descriptor if shape changed.
if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) {
destroy_descriptors();
constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value;
// ColMajor → CSC. outerIndexPtr = col offsets, innerIndexPtr = row indices.
EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr,
idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO, val_type));
cached_rows_ = m;
cached_cols_ = n;
cached_nnz_ = nnz;
} else {
// Same shape — just update pointers.
EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr));
}
}
void destroy_descriptors() {
if (spmat_desc_) {
(void)cusparseDestroySpMat(spmat_desc_);
spmat_desc_ = nullptr;
}
cached_rows_ = -1;
cached_cols_ = -1;
cached_nnz_ = -1;
}
void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) {
if (needed > current_size) {
if (buf.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
buf = internal::DeviceBuffer(needed);
current_size = needed;
}
}
};
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_CONTEXT_H

View File

@@ -0,0 +1,62 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse LDL^T / LDL^H factorization via cuDSS.
//
// For symmetric indefinite (or Hermitian indefinite) sparse matrices.
// Same three-phase workflow as GpuSparseLLT.
//
// Usage:
// GpuSparseLDLT<double> ldlt(A); // analyze + factorize
// VectorXd x = ldlt.solve(b); // solve
#ifndef EIGEN_GPU_SPARSE_LDLT_H
#define EIGEN_GPU_SPARSE_LDLT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSparseSolverBase.h"
namespace Eigen {
/** GPU sparse LDL^T factorization (symmetric indefinite / Hermitian indefinite).
*
* Wraps cuDSS with CUDSS_MTYPE_SYMMETRIC (real) or CUDSS_MTYPE_HERMITIAN (complex).
* Uses pivoting for numerical stability.
*
* \tparam Scalar_ float, double, complex<float>, or complex<double>
* \tparam UpLo_ Lower (default) or Upper — which triangle of A is stored
*/
template <typename Scalar_, int UpLo_ = Lower>
class GpuSparseLDLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT<Scalar_, UpLo_>> {
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT>;
friend Base;
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };
GpuSparseLDLT() = default;
template <typename InputType>
explicit GpuSparseLDLT(const SparseMatrixBase<InputType>& A) {
this->compute(A);
}
static constexpr bool needs_csr_conversion() { return false; }
static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_symmetric_type<Scalar>::value; }
static constexpr cudssMatrixViewType_t cudss_matrix_view() {
return internal::cudss_view_type<UpLo, ColMajor>::value;
}
};
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_LDLT_H

View File

@@ -0,0 +1,62 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse Cholesky (LL^T / LL^H) via cuDSS.
//
// Usage:
// GpuSparseLLT<double> llt(A); // analyze + factorize
// VectorXd x = llt.solve(b); // solve
// llt.analyzePattern(A); // or separate phases
// llt.factorize(A_new); // reuse symbolic analysis
#ifndef EIGEN_GPU_SPARSE_LLT_H
#define EIGEN_GPU_SPARSE_LLT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSparseSolverBase.h"
namespace Eigen {
/** GPU sparse Cholesky factorization (LL^T for real, LL^H for complex).
*
* Wraps cuDSS with CUDSS_MTYPE_SPD (real) or CUDSS_MTYPE_HPD (complex).
* Accepts ColMajor SparseMatrix (CSC), reinterpreted as CSR with swapped
* triangle view for zero-copy upload.
*
* \tparam Scalar_ float, double, complex<float>, or complex<double>
* \tparam UpLo_ Lower (default) or Upper — which triangle of A is stored
*/
template <typename Scalar_, int UpLo_ = Lower>
class GpuSparseLLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT<Scalar_, UpLo_>> {
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT>;
friend Base;
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };
GpuSparseLLT() = default;
template <typename InputType>
explicit GpuSparseLLT(const SparseMatrixBase<InputType>& A) {
this->compute(A);
}
static constexpr bool needs_csr_conversion() { return false; }
static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_spd_type<Scalar>::value; }
static constexpr cudssMatrixViewType_t cudss_matrix_view() {
return internal::cudss_view_type<UpLo, ColMajor>::value;
}
};
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_LLT_H

View File

@@ -0,0 +1,59 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse LU factorization via cuDSS.
//
// For general (non-symmetric) sparse matrices. Uses pivoting.
// Same three-phase workflow as GpuSparseLLT.
//
// Usage:
// GpuSparseLU<double> lu(A); // analyze + factorize
// VectorXd x = lu.solve(b); // solve
#ifndef EIGEN_GPU_SPARSE_LU_H
#define EIGEN_GPU_SPARSE_LU_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSparseSolverBase.h"
namespace Eigen {
/** GPU sparse LU factorization (general matrices).
*
* Wraps cuDSS with CUDSS_MTYPE_GENERAL and CUDSS_MVIEW_FULL.
* Accepts ColMajor SparseMatrix (CSC); internally converts to RowMajor
* CSR since cuDSS requires CSR input.
*
* \tparam Scalar_ float, double, complex<float>, or complex<double>
*/
template <typename Scalar_>
class GpuSparseLU : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLU<Scalar_>> {
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLU>;
friend Base;
public:
using Scalar = Scalar_;
GpuSparseLU() = default;
template <typename InputType>
explicit GpuSparseLU(const SparseMatrixBase<InputType>& A) {
this->compute(A);
}
static constexpr bool needs_csr_conversion() { return true; }
static constexpr cudssMatrixType_t cudss_matrix_type() { return CUDSS_MTYPE_GENERAL; }
static constexpr cudssMatrixViewType_t cudss_matrix_view() { return CUDSS_MVIEW_FULL; }
};
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_LU_H

View File

@@ -0,0 +1,356 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Common base for GPU sparse direct solvers (LLT, LDLT, LU) via cuDSS.
//
// All three solver types share the same three-phase workflow
// (analyzePattern → factorize → solve) and differ only in the
// cudssMatrixType_t and cudssMatrixViewType_t passed to cuDSS.
// This CRTP base implements the entire workflow; derived classes
// provide the matrix type/view via static constexpr members.
#ifndef EIGEN_GPU_SPARSE_SOLVER_BASE_H
#define EIGEN_GPU_SPARSE_SOLVER_BASE_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuDssSupport.h"
namespace Eigen {
namespace internal {
/** CRTP base for GPU sparse direct solvers.
*
* \tparam Scalar_ Element type (passed explicitly to avoid incomplete-type issues with CRTP).
* \tparam Derived The concrete solver class (GpuSparseLLT, GpuSparseLDLT, GpuSparseLU).
* Must provide:
* - `static constexpr cudssMatrixType_t cudss_matrix_type()`
* - `static constexpr cudssMatrixViewType_t cudss_matrix_view()`
*/
template <typename Scalar_, typename Derived>
class GpuSparseSolverBase {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using StorageIndex = int;
using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
using CsrMat = SparseMatrix<Scalar, RowMajor, StorageIndex>;
using DenseVector = Matrix<Scalar, Dynamic, 1>;
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
GpuSparseSolverBase() { init_context(); }
~GpuSparseSolverBase() {
destroy_cudss_objects();
if (handle_) (void)cudssDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuSparseSolverBase(const GpuSparseSolverBase&) = delete;
GpuSparseSolverBase& operator=(const GpuSparseSolverBase&) = delete;
// ---- Configuration --------------------------------------------------------
/** Set the fill-reducing ordering algorithm. Must be called before compute/analyzePattern. */
void setOrdering(GpuSparseOrdering ordering) { ordering_ = ordering; }
// ---- Factorization --------------------------------------------------------
/** Symbolic analysis + numeric factorization. */
template <typename InputType>
Derived& compute(const SparseMatrixBase<InputType>& A) {
analyzePattern(A);
if (info_ == Success) {
factorize(A);
}
return derived();
}
/** Symbolic analysis only. Uploads sparsity structure to device.
* This phase is synchronous (blocks until complete). */
template <typename InputType>
Derived& analyzePattern(const SparseMatrixBase<InputType>& A) {
const SpMat csc(A.derived());
eigen_assert(csc.rows() == csc.cols() && "GpuSparseSolver requires a square matrix");
eigen_assert(csc.isCompressed() && "GpuSparseSolver requires a compressed sparse matrix");
n_ = csc.rows();
info_ = InvalidInput;
analysis_done_ = false;
if (n_ == 0) {
nnz_ = 0;
info_ = Success;
analysis_done_ = true;
return derived();
}
// For symmetric solvers, ColMajor CSC can be reinterpreted as CSR with
// swapped triangle view (zero copy). For general solvers, we must convert
// to actual RowMajor CSR so cuDSS sees the correct matrix, not A^T.
if (Derived::needs_csr_conversion()) {
const CsrMat csr(csc);
nnz_ = csr.nonZeros();
upload_csr(csr);
} else {
nnz_ = csc.nonZeros();
upload_csr_from_csc(csc);
}
create_cudss_matrix();
apply_ordering_config();
if (data_) EIGEN_CUDSS_CHECK(cudssDataDestroy(handle_, data_));
EIGEN_CUDSS_CHECK(cudssDataCreate(handle_, &data_));
create_placeholder_dense();
EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_ANALYSIS, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
analysis_done_ = true;
info_ = Success;
return derived();
}
/** Numeric factorization using the symbolic analysis from analyzePattern.
*
* \warning The sparsity pattern (outerIndexPtr, innerIndexPtr) must be
* identical to the one passed to analyzePattern(). Only the numerical
* values may change. Passing a different pattern is undefined behavior.
* This matches the contract of CHOLMOD, UMFPACK, and cuDSS's own API.
*
* This phase is asynchronous — info() lazily synchronizes. */
template <typename InputType>
Derived& factorize(const SparseMatrixBase<InputType>& A) {
eigen_assert(analysis_done_ && "factorize() requires analyzePattern() first");
if (n_ == 0) {
info_ = Success;
return derived();
}
// Convert to the same format used in analyzePattern.
// Both temporaries must outlive the async memcpy (pageable H2D is actually
// synchronous w.r.t. the host, but keep them alive for clarity).
const SpMat csc(A.derived());
eigen_assert(csc.rows() == n_ && csc.cols() == n_);
const Scalar* value_ptr;
Index value_nnz;
CsrMat csr_tmp;
if (Derived::needs_csr_conversion()) {
csr_tmp = CsrMat(csc);
value_ptr = csr_tmp.valuePtr();
value_nnz = csr_tmp.nonZeros();
} else {
value_ptr = csc.valuePtr();
value_nnz = csc.nonZeros();
}
eigen_assert(value_nnz == nnz_);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, value_ptr, static_cast<size_t>(nnz_) * sizeof(Scalar),
cudaMemcpyHostToDevice, stream_));
EIGEN_CUDSS_CHECK(cudssMatrixSetValues(d_A_cudss_, d_values_.ptr));
info_ = InvalidInput;
info_synced_ = false;
EIGEN_CUDSS_CHECK(
cudssExecute(handle_, CUDSS_PHASE_FACTORIZATION, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
return derived();
}
// ---- Solve ----------------------------------------------------------------
/** Solve A * X = B. Returns X as a dense matrix.
* Supports single or multiple right-hand sides. */
template <typename Rhs>
DenseMatrix solve(const MatrixBase<Rhs>& B) const {
sync_info();
eigen_assert(info_ == Success && "GpuSparseSolver::solve requires a successful factorization");
eigen_assert(B.rows() == n_);
const DenseMatrix rhs(B);
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
if (n_ == 0) return DenseMatrix(0, rhs.cols());
const size_t rhs_bytes = static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar);
DeviceBuffer d_b(rhs_bytes);
DeviceBuffer d_x(rhs_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_b.ptr, rhs.data(), rhs_bytes, cudaMemcpyHostToDevice, stream_));
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
cudssMatrix_t b_cudss = nullptr, x_cudss = nullptr;
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&b_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_b.ptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&x_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_x.ptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_SOLVE, config_, data_, d_A_cudss_, x_cudss, b_cudss));
DenseMatrix X(n_, rhs.cols());
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_x.ptr, rhs_bytes, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
(void)cudssMatrixDestroy(b_cudss);
(void)cudssMatrixDestroy(x_cudss);
return X;
}
// ---- Accessors ------------------------------------------------------------
ComputationInfo info() const {
sync_info();
return info_;
}
Index rows() const { return n_; }
Index cols() const { return n_; }
cudaStream_t stream() const { return stream_; }
protected:
// ---- CUDA / cuDSS handles -------------------------------------------------
cudaStream_t stream_ = nullptr;
cudssHandle_t handle_ = nullptr;
cudssConfig_t config_ = nullptr;
cudssData_t data_ = nullptr;
cudssMatrix_t d_A_cudss_ = nullptr;
cudssMatrix_t d_x_cudss_ = nullptr;
cudssMatrix_t d_b_cudss_ = nullptr;
// ---- Device buffers for CSR arrays ----------------------------------------
DeviceBuffer d_rowPtr_;
DeviceBuffer d_colIdx_;
DeviceBuffer d_values_;
// ---- State ----------------------------------------------------------------
Index n_ = 0;
Index nnz_ = 0;
ComputationInfo info_ = InvalidInput;
bool info_synced_ = true;
bool analysis_done_ = false;
GpuSparseOrdering ordering_ = GpuSparseOrdering::AMD;
private:
Derived& derived() { return static_cast<Derived&>(*this); }
const Derived& derived() const { return static_cast<const Derived&>(*this); }
void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUDSS_CHECK(cudssCreate(&handle_));
EIGEN_CUDSS_CHECK(cudssSetStream(handle_, stream_));
EIGEN_CUDSS_CHECK(cudssConfigCreate(&config_));
}
void sync_info() const {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
int cudss_info = 0;
EIGEN_CUDSS_CHECK(cudssDataGet(handle_, data_, CUDSS_DATA_INFO, &cudss_info, sizeof(cudss_info), nullptr));
auto* self = const_cast<GpuSparseSolverBase*>(this);
self->info_ = (cudss_info == 0) ? Success : NumericalIssue;
self->info_synced_ = true;
}
}
void destroy_cudss_objects() {
if (d_A_cudss_) {
(void)cudssMatrixDestroy(d_A_cudss_);
d_A_cudss_ = nullptr;
}
if (d_x_cudss_) {
(void)cudssMatrixDestroy(d_x_cudss_);
d_x_cudss_ = nullptr;
}
if (d_b_cudss_) {
(void)cudssMatrixDestroy(d_b_cudss_);
d_b_cudss_ = nullptr;
}
if (data_) {
(void)cudssDataDestroy(handle_, data_);
data_ = nullptr;
}
if (config_) {
(void)cudssConfigDestroy(config_);
config_ = nullptr;
}
}
// Upload CSR from a RowMajor sparse matrix (native CSR).
void upload_csr(const CsrMat& csr) { upload_compressed(csr.outerIndexPtr(), csr.innerIndexPtr(), csr.valuePtr()); }
// Upload CSC arrays reinterpreted as CSR (for symmetric matrices: CSC(A) = CSR(A^T) = CSR(A)).
void upload_csr_from_csc(const SpMat& csc) {
upload_compressed(csc.outerIndexPtr(), csc.innerIndexPtr(), csc.valuePtr());
}
void upload_compressed(const StorageIndex* outer, const StorageIndex* inner, const Scalar* values) {
const size_t rowptr_bytes = static_cast<size_t>(n_ + 1) * sizeof(StorageIndex);
const size_t colidx_bytes = static_cast<size_t>(nnz_) * sizeof(StorageIndex);
const size_t values_bytes = static_cast<size_t>(nnz_) * sizeof(Scalar);
d_rowPtr_ = DeviceBuffer(rowptr_bytes);
d_colIdx_ = DeviceBuffer(colidx_bytes);
d_values_ = DeviceBuffer(values_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_rowPtr_.ptr, outer, rowptr_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_colIdx_.ptr, inner, colidx_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, values, values_bytes, cudaMemcpyHostToDevice, stream_));
}
void create_cudss_matrix() {
if (d_A_cudss_) (void)cudssMatrixDestroy(d_A_cudss_);
constexpr cudaDataType_t idx_type = cudss_index_type<StorageIndex>::value;
constexpr cudaDataType_t val_type = cuda_data_type<Scalar>::value;
constexpr cudssMatrixType_t mtype = Derived::cudss_matrix_type();
constexpr cudssMatrixViewType_t mview = Derived::cudss_matrix_view();
EIGEN_CUDSS_CHECK(cudssMatrixCreateCsr(
&d_A_cudss_, static_cast<int64_t>(n_), static_cast<int64_t>(n_), static_cast<int64_t>(nnz_), d_rowPtr_.ptr,
/*rowEnd=*/nullptr, d_colIdx_.ptr, d_values_.ptr, idx_type, val_type, mtype, mview, CUDSS_BASE_ZERO));
}
void apply_ordering_config() {
cudssAlgType_t alg;
switch (ordering_) {
case GpuSparseOrdering::AMD:
alg = CUDSS_ALG_DEFAULT;
break;
case GpuSparseOrdering::METIS:
alg = CUDSS_ALG_2;
break;
case GpuSparseOrdering::RCM:
alg = CUDSS_ALG_3;
break;
default:
alg = CUDSS_ALG_DEFAULT;
break;
}
EIGEN_CUDSS_CHECK(cudssConfigSet(config_, CUDSS_CONFIG_REORDERING_ALG, &alg, sizeof(alg)));
}
void create_placeholder_dense() {
if (d_x_cudss_) (void)cudssMatrixDestroy(d_x_cudss_);
if (d_b_cudss_) (void)cudssMatrixDestroy(d_b_cudss_);
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_x_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_b_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
}
};
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_SOLVER_BASE_H

View File

@@ -1,8 +1,8 @@
# Eigen GPU Module (`Eigen/GPU`) # Eigen GPU Module (`Eigen/GPU`)
GPU-accelerated dense linear algebra for Eigen users, dispatching to NVIDIA GPU-accelerated linear algebra for Eigen users, dispatching to NVIDIA CUDA
CUDA libraries (cuBLAS, cuSOLVER). Requires CUDA 11.4+. Header-only (link libraries (cuBLAS, cuSOLVER, cuFFT, cuSPARSE, cuDSS). Requires CUDA 11.4+;
against CUDA runtime, cuBLAS, and cuSOLVER). cuDSS features require CUDA 12.0+ and a separate cuDSS install. Header-only.
## Why this module ## Why this module
@@ -10,25 +10,31 @@ Eigen is the linear algebra foundation for a large ecosystem of C++ projects
in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP, in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP,
Open3D), scientific computing (Ceres, Stan), and beyond. Many of these Open3D), scientific computing (Ceres, Stan), and beyond. Many of these
projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations
without dropping down to raw CUDA library APIs. Third-party projects like without dropping down to raw CUDA library APIs.
[EigenCuda](https://github.com/NLESC-JCER/EigenCuda) and
[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically to fill
this gap, and downstream projects like
[Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
GPU-accelerated solvers through Eigen.
The `Eigen/GPU` module aims to close this gap: Existing Eigen users should be GPU sparse solvers are a particularly acute gap. Sparse factorization is the
able to move performance-critical dense linear algebra to the GPU with minimal bottleneck in SLAM, bundle adjustment, FEM, and nonlinear optimization --
code changes and without learning CUDA library APIs directly. exactly the workloads where GPU acceleration matters most. Downstream projects
like [Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
GPU-accelerated sparse solvers, and third-party projects like
[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically because
Eigen lacks them. The `Eigen/GPU` module provides GPU sparse Cholesky, LDL^T,
and LU factorization via cuDSS, alongside dense solvers (cuSOLVER), matrix
products (cuBLAS), FFT (cuFFT), and sparse matrix-vector products (cuSPARSE).
Existing Eigen users should be able to move performance-critical dense or
sparse linear algebra to the GPU with minimal code changes and without
learning CUDA library APIs directly.
## Design philosophy ## Design philosophy
**CPU and GPU coexist.** There is no global compile-time switch that replaces **CPU and GPU coexist.** There is no global compile-time switch that replaces
CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers
explicitly -- `GpuLLT<double>` vs `LLT<MatrixXd>` -- and both coexist in explicitly -- `GpuLLT<double>` vs `LLT<MatrixXd>`, `GpuSparseLLT<double>` vs
the same binary. This also lets users keep the factored matrix on device across `SimplicialLLT<SparseMatrix<double>>` -- and both coexist in the same binary.
multiple solves, something impossible with compile-time replacement. This also lets users keep the factored matrix on device across multiple solves,
something impossible with compile-time replacement.
**Familiar syntax.** GPU operations use the same expression patterns as CPU **Familiar syntax.** GPU operations use the same expression patterns as CPU
Eigen. Here is a side-by-side comparison: Eigen. Here is a side-by-side comparison:
@@ -38,6 +44,7 @@ Eigen. Here is a side-by-side comparison:
#include <Eigen/Dense> #define EIGEN_USE_GPU #include <Eigen/Dense> #define EIGEN_USE_GPU
#include <Eigen/GPU> #include <Eigen/GPU>
// Dense
MatrixXd A = ...; auto d_A = DeviceMatrix<double>::fromHost(A); MatrixXd A = ...; auto d_A = DeviceMatrix<double>::fromHost(A);
MatrixXd B = ...; auto d_B = DeviceMatrix<double>::fromHost(B); MatrixXd B = ...; auto d_B = DeviceMatrix<double>::fromHost(B);
@@ -45,11 +52,15 @@ MatrixXd C = A * B; DeviceMatrix<double> d_C = d_A * d_B;
MatrixXd X = A.llt().solve(B); DeviceMatrix<double> d_X = d_A.llt().solve(d_B); MatrixXd X = A.llt().solve(B); DeviceMatrix<double> d_X = d_A.llt().solve(d_B);
MatrixXd X = d_X.toHost(); MatrixXd X = d_X.toHost();
// Sparse (using SpMat = SparseMatrix<double>)
SimplicialLLT<SpMat> llt(A); GpuSparseLLT<double> llt(A);
VectorXd x = llt.solve(b); VectorXd x = llt.solve(b);
``` ```
The GPU version reads like CPU Eigen with explicit upload/download. The GPU version reads like CPU Eigen with explicit upload/download for dense
`operator*` dispatches to cuBLAS GEMM, `.llt().solve()` dispatches to operations, and an almost identical API for sparse solvers. Unsupported
cuSOLVER potrf + potrs. Unsupported expressions are compile errors. expressions are compile errors.
**Explicit over implicit.** Host-device transfers, stream management, and **Explicit over implicit.** Host-device transfers, stream management, and
library handle lifetimes are visible in the API. There are no hidden library handle lifetimes are visible in the API. There are no hidden
@@ -160,10 +171,109 @@ MatrixXd X2 = d_X2.toHost();
GpuLU<double> lu; GpuLU<double> lu;
lu.compute(d_A); lu.compute(d_A);
auto d_Y = lu.solve(d_B, GpuLU<double>::Transpose); // A^T Y = B auto d_Y = lu.solve(d_B, GpuLU<double>::Transpose); // A^T Y = B
// QR solve (overdetermined least squares)
GpuQR<double> qr;
qr.compute(d_A); // factorize on device (async)
auto d_X = qr.solve(d_B); // Q^H * B via ormqr, then trsm on R
MatrixXd X = d_X.toHost();
// SVD (results downloaded on access)
GpuSVD<double> svd;
svd.compute(d_A, ComputeThinU | ComputeThinV);
VectorXd S = svd.singularValues(); // downloads to host
MatrixXd U = svd.matrixU(); // downloads to host
MatrixXd VT = svd.matrixVT(); // V^T (matches cuSOLVER)
// Self-adjoint eigenvalue decomposition (results downloaded on access)
GpuSelfAdjointEigenSolver<double> es;
es.compute(d_A);
VectorXd eigenvals = es.eigenvalues(); // downloads to host
MatrixXd eigenvecs = es.eigenvectors(); // downloads to host
``` ```
The cached API keeps the factored matrix on device, avoiding redundant The cached API keeps the factored matrix on device, avoiding redundant
host-device transfers and re-factorizations. host-device transfers and re-factorizations. All solvers also accept host
matrices directly as a convenience (e.g., `GpuLLT<double> llt(A)` or
`qr.solve(B)`), which handles upload/download internally.
### Sparse direct solvers (cuDSS)
Requires cuDSS (separate install, CUDA 12.0+). Define `EIGEN_CUDSS` before
including `Eigen/GPU` and link with `-lcudss`.
```cpp
SparseMatrix<double> A = ...; // symmetric positive definite
VectorXd b = ...;
// Sparse Cholesky -- one-liner
GpuSparseLLT<double> llt(A);
VectorXd x = llt.solve(b);
// Three-phase workflow for repeated solves with the same sparsity pattern
GpuSparseLLT<double> llt;
llt.analyzePattern(A); // symbolic analysis (once)
llt.factorize(A); // numeric factorization
VectorXd x = llt.solve(b);
llt.factorize(A_new_values); // refactorize (reuses symbolic analysis)
VectorXd x2 = llt.solve(b);
// Sparse LDL^T (symmetric indefinite)
GpuSparseLDLT<double> ldlt(A);
VectorXd x = ldlt.solve(b);
// Sparse LU (general non-symmetric)
GpuSparseLU<double> lu(A);
VectorXd x = lu.solve(b);
```
### FFT (cuFFT)
```cpp
GpuFFT<float> fft;
// 1D complex-to-complex
VectorXcf X = fft.fwd(x); // forward
VectorXcf y = fft.inv(X); // inverse (scaled by 1/n)
// 1D real-to-complex / complex-to-real
VectorXcf R = fft.fwd(r); // returns n/2+1 complex (half-spectrum)
VectorXf s = fft.invReal(R, n); // C2R inverse, caller specifies n
// 2D complex-to-complex
MatrixXcf B = fft.fwd2d(A); // 2D forward
MatrixXcf C = fft.inv2d(B); // 2D inverse (scaled by 1/(rows*cols))
// Plans are cached and reused across calls with the same size/type.
```
### Sparse matrix-vector multiply (cuSPARSE)
```cpp
SparseMatrix<double> A = ...;
VectorXd x = ...;
GpuSparseContext<double> ctx;
VectorXd y = ctx.multiply(A, x); // y = A * x
VectorXd z = ctx.multiplyT(A, x); // z = A^T * x
ctx.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y
// Multiple RHS (SpMM)
MatrixXd Y = ctx.multiplyMat(A, X); // Y = A * X
```
### Precision control
GEMM dispatch enables tensor core algorithms by default, allowing cuBLAS to
choose the fastest algorithm for the given precision and architecture. For
double precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64
results computed faster via tensor cores.
| Macro | Effect |
|---|---|
| *(default)* | Tensor core algorithms enabled. Float uses full FP32. Double may use Ozaki on sm_80+. |
| `EIGEN_CUDA_TF32` | Opt-in: Float uses TF32 (~2x faster, 10-bit mantissa). Double unaffected. |
| `EIGEN_NO_CUDA_TENSOR_OPS` | Opt-out: Pedantic compute types, no tensor cores. For bit-exact reproducibility. |
### Stream control and async execution ### Stream control and async execution
@@ -190,7 +300,8 @@ skip the wait (CUDA guarantees in-order execution within a stream).
### Supported scalar types ### Supported scalar types
`float`, `double`, `std::complex<float>`, `std::complex<double>`. `float`, `double`, `std::complex<float>`, `std::complex<double>` (unless
noted otherwise).
### Expression -> library call mapping ### Expression -> library call mapping
@@ -212,29 +323,41 @@ skip the wait (CUDA guarantees in-order execution within a stream).
| `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo | | `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
| `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N | | `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
### `DeviceMatrix<Scalar>` API ### `DeviceMatrix<Scalar>`
| Method | Sync? | Description | Typed RAII wrapper for a dense column-major matrix in GPU device memory.
|--------|-------|-------------| Always dense (leading dimension = rows). A vector is a `DeviceMatrix` with
| `DeviceMatrix()` | -- | Empty (0x0) | one column.
| `DeviceMatrix(rows, cols)` | -- | Allocate uninitialized |
| `fromHost(matrix, stream)` | yes | Upload from Eigen matrix | ```cpp
| `fromHostAsync(ptr, rows, cols, outerStride, stream)` | no | Async upload (caller manages lifetime) | // Construction
| `toHost(stream)` | yes | Synchronous download | DeviceMatrix<Scalar>() // Empty (0x0)
| `toHostAsync(stream)` | no | Returns `HostTransfer` future | DeviceMatrix<Scalar>(rows, cols) // Allocate uninitialized
| `clone(stream)` | no | Device-to-device deep copy |
| `resize(rows, cols)` | -- | Discard contents, reallocate | // Upload / download
| `data()` | -- | Raw device pointer | static DeviceMatrix fromHost(matrix, stream=nullptr) // -> DeviceMatrix (syncs)
| `rows()`, `cols()` | -- | Dimensions | static DeviceMatrix fromHostAsync(ptr, rows, cols, outerStride, s) // -> DeviceMatrix (no sync, caller manages ptr lifetime)
| `sizeInBytes()` | -- | Total device allocation size in bytes | PlainMatrix toHost(stream=nullptr) // -> host Matrix (syncs)
| `empty()` | -- | True if 0x0 | HostTransfer toHostAsync(stream=nullptr) // -> HostTransfer future (no sync)
| `adjoint()` | -- | Adjoint view (GEMM ConjTrans) | DeviceMatrix clone(stream=nullptr) // -> DeviceMatrix (D2D copy, async)
| `transpose()` | -- | Transpose view (GEMM Trans) |
| `llt()` / `llt<UpLo>()` | -- | Cholesky expression builder | // Dimensions and access
| `lu()` | -- | LU expression builder | Index rows()
| `triangularView<UpLo>()` | -- | Triangular view (TRSM) | Index cols()
| `selfadjointView<UpLo>()` | -- | Self-adjoint view (SYMM, rankUpdate) | size_t sizeInBytes()
| `device(ctx)` | -- | Assignment proxy bound to context | bool empty()
Scalar* data() // Raw device pointer
void resize(Index rows, Index cols) // Discard contents, reallocate
// Expression builders (return lightweight views, evaluated on assignment)
AdjointView adjoint() // GEMM with ConjTrans
TransposeView transpose() // GEMM with Trans
LltExpr llt() / llt<UpLo>() // -> .solve(d_B) -> DeviceMatrix
LuExpr lu() // -> .solve(d_B) -> DeviceMatrix
TriangularView triangularView<UpLo>() // -> .solve(d_B) -> DeviceMatrix (TRSM)
SelfAdjointView selfadjointView<UpLo>() // -> * d_B (SYMM), .rankUpdate(d_A) (SYRK)
DeviceAssignment device(GpuContext& ctx) // Bind assignment to explicit stream
```
### `GpuContext` ### `GpuContext`
@@ -251,34 +374,190 @@ cusolverDnHandle_t cusolverHandle()
Non-copyable, non-movable (owns library handles). Non-copyable, non-movable (owns library handles).
### `GpuLLT<Scalar, UpLo>` API ### `GpuLLT<Scalar, UpLo>` -- Dense Cholesky (cuSOLVER)
GPU dense Cholesky (LL^T) via cuSOLVER. Caches factor on device. Caches the Cholesky factor on device for repeated solves.
| Method | Sync? | Description | ```cpp
|--------|-------|-------------| GpuLLT() // Default construct, then call compute()
| `GpuLLT(A)` | deferred | Construct and factorize from host matrix | GpuLLT(const EigenBase<D>& A) // Convenience: upload + factorize
| `compute(host_matrix)` | deferred | Upload and factorize |
| `compute(DeviceMatrix)` | deferred | D2D copy and factorize |
| `compute(DeviceMatrix&&)` | deferred | Move-adopt and factorize (no copy) |
| `solve(host_matrix)` | yes | Solve, return host matrix |
| `solve(DeviceMatrix)` | no | Solve, return `DeviceMatrix` (async) |
| `info()` | lazy | Syncs stream on first call, returns `Success` or `NumericalIssue` |
### `GpuLU<Scalar>` API GpuLLT& compute(const EigenBase<D>& A) // Upload + factorize
GpuLLT& compute(const DeviceMatrix& d_A) // D2D copy + factorize
GpuLLT& compute(DeviceMatrix&& d_A) // Adopt + factorize (no copy)
GPU dense partial-pivoting LU via cuSOLVER. Same pattern as `GpuLLT`, plus PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
`TransposeMode` parameter on `solve()` (`NoTranspose`, `Transpose`, DeviceMatrix solve(const DeviceMatrix& d_B) // -> DeviceMatrix (async, stays on device)
`ConjugateTranspose`).
### `HostTransfer<Scalar>` API ComputationInfo info() // Lazy sync on first call: Success or NumericalIssue
Index rows() / cols()
cudaStream_t stream()
```
Future for async device-to-host transfer. ### `GpuLU<Scalar>` -- Dense LU (cuSOLVER)
| Method | Description | Same pattern as `GpuLLT`. Adds `TransposeMode` parameter on `solve()`.
|--------|-------------|
| `get()` | Block until transfer completes, return host matrix reference. Idempotent. | ```cpp
| `ready()` | Non-blocking poll | PlainMatrix solve(const MatrixBase<D>& B, TransposeMode m = NoTranspose) // -> host Matrix
DeviceMatrix solve(const DeviceMatrix& d_B, TransposeMode m = NoTranspose) // -> DeviceMatrix
```
`TransposeMode`: `NoTranspose`, `Transpose`, `ConjugateTranspose`.
### `GpuQR<Scalar>` -- Dense QR (cuSOLVER)
QR factorization via `cusolverDnXgeqrf`. Solve uses ORMQR (apply Q^H) + TRSM
(back-substitute on R) -- Q is never formed explicitly.
```cpp
GpuQR() // Default construct
GpuQR(const EigenBase<D>& A) // Convenience: upload + factorize
GpuQR& compute(const EigenBase<D>& A) // Upload + factorize
GpuQR& compute(const DeviceMatrix& d_A) // D2D copy + factorize
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
DeviceMatrix solve(const DeviceMatrix& d_B) // -> DeviceMatrix (async)
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
### `GpuSVD<Scalar>` -- Dense SVD (cuSOLVER)
SVD via `cusolverDnXgesvd`. Supports `ComputeThinU | ComputeThinV`,
`ComputeFullU | ComputeFullV`, or `0` (values only). Wide matrices (m < n)
handled by internal transpose.
```cpp
GpuSVD() // Default construct, then call compute()
GpuSVD(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV) // Convenience
GpuSVD& compute(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV)
GpuSVD& compute(const DeviceMatrix& d_A, unsigned options = ComputeThinU | ComputeThinV)
RealVector singularValues() // -> host vector (syncs, downloads)
PlainMatrix matrixU() // -> host Matrix (syncs, downloads)
PlainMatrix matrixVT() // -> host Matrix (syncs, downloads V^T)
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (pseudoinverse)
PlainMatrix solve(const MatrixBase<D>& B, Index k) // Truncated (top k triplets)
PlainMatrix solve(const MatrixBase<D>& B, RealScalar l) // Tikhonov regularized
Index rank(RealScalar threshold = -1)
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
**Note:** `singularValues()`, `matrixU()`, and `matrixVT()` download to host
on each call. Device-side accessors returning `DeviceMatrix` are planned but
not yet implemented.
### `GpuSelfAdjointEigenSolver<Scalar>` -- Eigendecomposition (cuSOLVER)
Symmetric/Hermitian eigenvalue decomposition via `cusolverDnXsyevd`.
`ComputeMode` enum: `EigenvaluesOnly`, `ComputeEigenvectors`.
```cpp
GpuSelfAdjointEigenSolver() // Default construct, then call compute()
GpuSelfAdjointEigenSolver(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors) // Convenience
GpuSelfAdjointEigenSolver& compute(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors)
GpuSelfAdjointEigenSolver& compute(const DeviceMatrix& d_A, ComputeMode mode = ComputeEigenvectors)
RealVector eigenvalues() // -> host vector (syncs, downloads, ascending order)
PlainMatrix eigenvectors() // -> host Matrix (syncs, downloads, columns)
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
**Note:** `eigenvalues()` and `eigenvectors()` download to host on each call.
Device-side accessors returning `DeviceMatrix` are planned but not yet
implemented.
### `HostTransfer<Scalar>`
Future for async device-to-host transfer. Returned by
`DeviceMatrix::toHostAsync()`.
```cpp
PlainMatrix& get() // Block until complete, return host Matrix ref. Idempotent.
bool ready() // Non-blocking poll
```
### `GpuSparseLLT<Scalar, UpLo>` -- Sparse Cholesky (cuDSS)
Requires cuDSS (CUDA 12.0+, `#define EIGEN_CUDSS`). Three-phase workflow
with symbolic reuse. Accepts `SparseMatrix<Scalar, ColMajor, int>` (CSC).
```cpp
GpuSparseLLT() // Default construct
GpuSparseLLT(const SparseMatrixBase<D>& A) // Analyze + factorize
GpuSparseLLT& analyzePattern(const SparseMatrixBase<D>& A) // Symbolic analysis (reusable)
GpuSparseLLT& factorize(const SparseMatrixBase<D>& A) // Numeric factorization
GpuSparseLLT& compute(const SparseMatrixBase<D>& A) // analyzePattern + factorize
void setOrdering(GpuSparseOrdering ord) // AMD (default), METIS, or RCM
DenseMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
### `GpuSparseLDLT<Scalar, UpLo>` -- Sparse LDL^T (cuDSS)
Symmetric indefinite. Same API as `GpuSparseLLT`.
### `GpuSparseLU<Scalar>` -- Sparse LU (cuDSS)
General non-symmetric. Same API as `GpuSparseLLT` (without `UpLo`).
### `GpuFFT<Scalar>` -- FFT (cuFFT)
Plans cached by (size, type) and reused. Inverse transforms scaled so
`inv(fwd(x)) == x`. Supported scalars: `float`, `double`.
```cpp
// 1D transforms (host vectors in and out)
ComplexVector fwd(const MatrixBase<D>& x) // C2C forward (complex input)
ComplexVector fwd(const MatrixBase<D>& x) // R2C forward (real input, returns n/2+1)
ComplexVector inv(const MatrixBase<D>& X) // C2C inverse, scaled by 1/n
RealVector invReal(const MatrixBase<D>& X, Index n) // C2R inverse, scaled by 1/n
// 2D transforms (host matrices in and out)
ComplexMatrix fwd2d(const MatrixBase<D>& A) // 2D C2C forward
ComplexMatrix inv2d(const MatrixBase<D>& A) // 2D C2C inverse, scaled by 1/(rows*cols)
cudaStream_t stream()
```
All FFT methods accept host data and return host data. Upload/download is
handled internally. The C2C and R2C overloads of `fwd()` are distinguished by
the input scalar type (complex vs real).
### `GpuSparseContext<Scalar>` -- SpMV/SpMM (cuSPARSE)
Accepts `SparseMatrix<Scalar, ColMajor>`. All methods accept host data and
return host data.
```cpp
GpuSparseContext() // Creates own stream + cuSPARSE handle
DenseVector multiply(A, x) // y = A * x
void multiply(A, x, y, alpha=1, beta=0, // y = alpha*op(A)*x + beta*y
op=CUSPARSE_OPERATION_NON_TRANSPOSE)
DenseVector multiplyT(A, x) // y = A^T * x
DenseMatrix multiplyMat(A, X) // Y = A * X (SpMM)
cudaStream_t stream()
```
### Aliasing ### Aliasing
@@ -303,6 +582,18 @@ The caller must ensure operands don't alias the destination for GEMM and TRSM
| `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping | | `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization | | `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization | | `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
| `GpuQR.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense QR decomposition |
| `GpuSVD.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense SVD decomposition |
| `GpuEigenSolver.h` | `CuSolverSupport.h` | Self-adjoint eigenvalue decomposition |
| `CuFftSupport.h` | `GpuSupport.h`, `<cufft.h>` | cuFFT error macro, type-dispatch wrappers |
| `GpuFFT.h` | `CuFftSupport.h`, `CuBlasSupport.h` | 1D/2D FFT with plan caching |
| `CuSparseSupport.h` | `GpuSupport.h`, `<cusparse.h>` | cuSPARSE error macro |
| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE |
| `CuDssSupport.h` | `GpuSupport.h`, `<cudss.h>` | cuDSS error macro, type traits (optional) |
| `GpuSparseSolverBase.h` | `CuDssSupport.h` | CRTP base for sparse solvers (optional) |
| `GpuSparseLLT.h` | `GpuSparseSolverBase.h` | Sparse Cholesky via cuDSS (optional) |
| `GpuSparseLDLT.h` | `GpuSparseSolverBase.h` | Sparse LDL^T via cuDSS (optional) |
| `GpuSparseLU.h` | `GpuSparseSolverBase.h` | Sparse LU via cuDSS (optional) |
## Building and testing ## Building and testing
@@ -313,6 +604,33 @@ cmake -G Ninja -B build -S . \
-DEIGEN_TEST_CUBLAS=ON \ -DEIGEN_TEST_CUBLAS=ON \
-DEIGEN_TEST_CUSOLVER=ON -DEIGEN_TEST_CUSOLVER=ON
cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu gpu_device_matrix cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu \
ctest --test-dir build -R "gpu_cublas|gpu_cusolver|gpu_device" --output-on-failure gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen \
gpu_device_matrix gpu_cufft gpu_cusparse_spmv
ctest --test-dir build -R "gpu_" --output-on-failure
# Sparse solvers (cuDSS -- separate install required)
cmake -G Ninja -B build -S . \
-DEIGEN_TEST_CUDA=ON \
-DEIGEN_CUDA_COMPUTE_ARCH="70" \
-DEIGEN_TEST_CUDSS=ON
cmake --build build --target gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu
ctest --test-dir build -R gpu_cudss --output-on-failure
``` ```
## Future work
- **Device-side accessors for decomposition results.** `GpuSVD`,
`GpuSelfAdjointEigenSolver`, and `GpuQR` currently download decomposition
results to host on access (e.g., `svd.matrixU()` returns a host `MatrixXd`).
Device-side accessors returning `DeviceMatrix` views of the internal buffers
would allow chaining GPU operations (e.g., `svd.deviceU() * d_A`) without
round-tripping through host memory.
- **Device-resident sparse matrix-vector products.** `GpuSparseContext`
currently operates on host vectors and matrices, uploading and downloading
on each call. The key missing piece is a `DeviceSparseView` that holds a
sparse matrix on device and supports operator syntax (`d_y = d_A * d_x`)
with `DeviceMatrix` operands -- keeping the entire SpMV/SpMM pipeline on
device. This is essential for iterative solvers and any workflow that chains
sparse and dense operations without returning to the host.

View File

@@ -51,3 +51,7 @@ eigen_add_gpu_benchmark(bench_gpu_chaining_float bench_gpu_chaining.cpp DEFINITI
# Batching benchmarks: multi-stream concurrency for many small systems. # Batching benchmarks: multi-stream concurrency for many small systems.
eigen_add_gpu_benchmark(bench_gpu_batching bench_gpu_batching.cpp) eigen_add_gpu_benchmark(bench_gpu_batching bench_gpu_batching.cpp)
eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITIONS SCALAR=float) eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITIONS SCALAR=float)
# FFT benchmarks: 1D/2D C2C, R2C, C2R throughput and plan reuse.
eigen_add_gpu_benchmark(bench_gpu_fft bench_gpu_fft.cpp LIBRARIES CUDA::cufft)
eigen_add_gpu_benchmark(bench_gpu_fft_double bench_gpu_fft.cpp LIBRARIES CUDA::cufft DEFINITIONS SCALAR=double)

View File

@@ -0,0 +1,185 @@
// GPU FFT benchmarks: GpuFFT 1D and 2D throughput.
//
// Measures forward and inverse FFT performance across a range of sizes,
// including plan-amortized (reuse) and cold-start (new plan) scenarios.
//
// Usage:
// cmake --build build-bench-gpu --target bench_gpu_fft
// ./build-bench-gpu/bench_gpu_fft
//
// Profiling:
// nsys profile --trace=cuda ./build-bench-gpu/bench_gpu_fft
#include <benchmark/benchmark.h>
#include <Eigen/GPU>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
using Scalar = SCALAR;
using Complex = std::complex<Scalar>;
using CVec = Matrix<Complex, Dynamic, 1>;
using RVec = Matrix<Scalar, Dynamic, 1>;
using CMat = Matrix<Complex, Dynamic, Dynamic>;
// CUDA warm-up: ensure the GPU is initialized before timing.
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
// --------------------------------------------------------------------------
// 1D C2C Forward
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_C2C_Fwd(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CVec x = CVec::Random(n);
GpuFFT<Scalar> fft;
// Warm up plan.
CVec tmp = fft.fwd(x);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.fwd(x));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2); // read + write
}
BENCHMARK(BM_GpuFFT_1D_C2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 1D C2C Inverse
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_C2C_Inv(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CVec x = CVec::Random(n);
GpuFFT<Scalar> fft;
CVec X = fft.fwd(x);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.inv(X));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2);
}
BENCHMARK(BM_GpuFFT_1D_C2C_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 1D R2C Forward
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_R2C_Fwd(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
RVec r = RVec::Random(n);
GpuFFT<Scalar> fft;
// Warm up plan.
CVec tmp = fft.fwd(r);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.fwd(r));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * (n * sizeof(Scalar) + (n / 2 + 1) * sizeof(Complex)));
}
BENCHMARK(BM_GpuFFT_1D_R2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 1D C2R Inverse
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_C2R_Inv(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
RVec r = RVec::Random(n);
GpuFFT<Scalar> fft;
CVec R = fft.fwd(r);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.invReal(R, n));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * ((n / 2 + 1) * sizeof(Complex) + n * sizeof(Scalar)));
}
BENCHMARK(BM_GpuFFT_1D_C2R_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 2D C2C Forward
// --------------------------------------------------------------------------
static void BM_GpuFFT_2D_C2C_Fwd(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0); // square n x n
CMat A = CMat::Random(n, n);
GpuFFT<Scalar> fft;
// Warm up plan.
CMat tmp = fft.fwd2d(A);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.fwd2d(A));
}
state.SetItemsProcessed(state.iterations() * n * n);
state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 2);
}
BENCHMARK(BM_GpuFFT_2D_C2C_Fwd)->RangeMultiplier(2)->Range(64, 4096);
// --------------------------------------------------------------------------
// 2D C2C Roundtrip (fwd + inv)
// --------------------------------------------------------------------------
static void BM_GpuFFT_2D_C2C_Roundtrip(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CMat A = CMat::Random(n, n);
GpuFFT<Scalar> fft;
// Warm up plans.
CMat tmp = fft.inv2d(fft.fwd2d(A));
for (auto _ : state) {
CMat B = fft.fwd2d(A);
benchmark::DoNotOptimize(fft.inv2d(B));
}
state.SetItemsProcessed(state.iterations() * n * n * 2); // fwd + inv
state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 4);
}
BENCHMARK(BM_GpuFFT_2D_C2C_Roundtrip)->RangeMultiplier(2)->Range(64, 4096);
// --------------------------------------------------------------------------
// 1D Cold start (includes plan creation)
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_ColdStart(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CVec x = CVec::Random(n);
for (auto _ : state) {
GpuFFT<Scalar> fft; // new object = new plans
benchmark::DoNotOptimize(fft.fwd(x));
}
state.SetItemsProcessed(state.iterations() * n);
}
BENCHMARK(BM_GpuFFT_1D_ColdStart)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);

View File

@@ -528,7 +528,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
# compiler and linked against CUDA runtime + cuSOLVER. This avoids NVCC # compiler and linked against CUDA runtime + cuSOLVER. This avoids NVCC
# instantiating Eigen's CPU packet operations for CUDA vector types. # instantiating Eigen's CPU packet operations for CUDA vector types.
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
foreach(_cusolver_test IN ITEMS gpu_cusolver_llt gpu_cusolver_lu) foreach(_cusolver_test IN ITEMS gpu_cusolver_llt gpu_cusolver_lu gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen)
add_executable(${_cusolver_test} ${_cusolver_test}.cpp) add_executable(${_cusolver_test} ${_cusolver_test}.cpp)
target_include_directories(${_cusolver_test} PRIVATE target_include_directories(${_cusolver_test} PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include" "${CUDA_TOOLKIT_ROOT_DIR}/include"
@@ -547,11 +547,86 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif() endif()
# cuFFT test (cuFFT is part of the CUDA toolkit — no separate option needed).
if(TARGET CUDA::cufft)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
add_executable(gpu_cufft gpu_cufft.cpp)
target_include_directories(gpu_cufft PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_cufft
Eigen3::Eigen CUDA::cudart CUDA::cufft CUDA::cublas)
target_compile_definitions(gpu_cufft PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
add_test(NAME gpu_cufft COMMAND gpu_cufft)
add_dependencies(buildtests gpu_cufft)
add_dependencies(buildtests_gpu gpu_cufft)
set_property(TEST gpu_cufft APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST gpu_cufft PROPERTY SKIP_RETURN_CODE 77)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif()
# cuSPARSE SpMV test (cuSPARSE is part of the CUDA toolkit).
if(TARGET CUDA::cusparse)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
add_executable(gpu_cusparse_spmv gpu_cusparse_spmv.cpp)
target_include_directories(gpu_cusparse_spmv PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_cusparse_spmv
Eigen3::Eigen CUDA::cudart CUDA::cusparse)
target_compile_definitions(gpu_cusparse_spmv PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
add_test(NAME gpu_cusparse_spmv COMMAND gpu_cusparse_spmv)
add_dependencies(buildtests gpu_cusparse_spmv)
add_dependencies(buildtests_gpu gpu_cusparse_spmv)
set_property(TEST gpu_cusparse_spmv APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST gpu_cusparse_spmv PROPERTY SKIP_RETURN_CODE 77)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif()
option(EIGEN_TEST_CUSPARSE "Test cuSPARSE integration" OFF) option(EIGEN_TEST_CUSPARSE "Test cuSPARSE integration" OFF)
if(EIGEN_TEST_CUSPARSE AND TARGET CUDA::cusparse) if(EIGEN_TEST_CUSPARSE AND TARGET CUDA::cusparse)
ei_add_test(gpu_cusparse "" "CUDA::cusparse") ei_add_test(gpu_cusparse "" "CUDA::cusparse")
endif() endif()
# cuDSS sparse direct solver tests.
# cuDSS is distributed separately from the CUDA Toolkit.
option(EIGEN_TEST_CUDSS "Test cuDSS sparse solver integration" OFF)
if(EIGEN_TEST_CUDSS)
find_path(CUDSS_INCLUDE_DIR cudss.h
HINTS ${CUDSS_DIR}/include ${CUDA_TOOLKIT_ROOT_DIR}/include /usr/include)
find_library(CUDSS_LIBRARY cudss
HINTS ${CUDSS_DIR}/lib ${CUDSS_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib/x86_64-linux-gnu)
if(CUDSS_INCLUDE_DIR AND CUDSS_LIBRARY)
message(STATUS "cuDSS found: ${CUDSS_LIBRARY}")
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
foreach(_cudss_test IN ITEMS gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu)
add_executable(${_cudss_test} ${_cudss_test}.cpp)
target_include_directories(${_cudss_test} PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CUDSS_INCLUDE_DIR}"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(${_cudss_test}
Eigen3::Eigen CUDA::cudart CUDA::cusolver CUDA::cublas ${CUDSS_LIBRARY})
target_compile_definitions(${_cudss_test} PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1
EIGEN_CUDSS=1)
add_test(NAME ${_cudss_test} COMMAND "${_cudss_test}")
add_dependencies(buildtests ${_cudss_test})
add_dependencies(buildtests_gpu ${_cudss_test})
set_property(TEST ${_cudss_test} APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST ${_cudss_test} PROPERTY SKIP_RETURN_CODE 77)
endforeach()
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
else()
message(WARNING "EIGEN_TEST_CUDSS=ON but cuDSS not found. Set CUDSS_DIR.")
endif()
endif()
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif() endif()

View File

@@ -16,6 +16,32 @@
using namespace Eigen; using namespace Eigen;
// Unit roundoff for GPU GEMM compute precision.
// TF32 (opt-in via EIGEN_CUDA_TF32) has eps ~ 2^{-10}.
template <typename Scalar>
typename NumTraits<Scalar>::Real gpu_unit_roundoff() {
#if defined(EIGEN_CUDA_TF32) && !defined(EIGEN_NO_CUDA_TENSOR_OPS)
using RealScalar = typename NumTraits<Scalar>::Real;
if (std::is_same<RealScalar, float>::value) return RealScalar(9.8e-4);
#endif
return NumTraits<Scalar>::epsilon();
}
// Higham-Mary probabilistic error bound for GEMM:
// ||C - fl(C)||_F <= lambda * sqrt(k) * u * ||A||_F * ||B||_F
// where k is the inner dimension, u is the unit roundoff, and
// lambda = sqrt(2 * ln(2/delta)) with delta = failure probability.
// lambda = 5 corresponds to delta ~ 10^{-6}.
// Reference: Higham & Mary, "Probabilistic Error Analysis for Inner Products",
// SIAM J. Matrix Anal. Appl., 2019.
template <typename Scalar>
typename NumTraits<Scalar>::Real gemm_error_bound(Index k, typename NumTraits<Scalar>::Real normA,
typename NumTraits<Scalar>::Real normB) {
using RealScalar = typename NumTraits<Scalar>::Real;
constexpr RealScalar lambda = 5;
return lambda * std::sqrt(static_cast<RealScalar>(k)) * gpu_unit_roundoff<Scalar>() * normA * normB;
}
// ---- Basic GEMM: C = A * B ------------------------------------------------- // ---- Basic GEMM: C = A * B -------------------------------------------------
template <typename Scalar> template <typename Scalar>
@@ -36,7 +62,7 @@ void test_gemm_basic(Index m, Index n, Index k) {
Mat C = d_C.toHost(); Mat C = d_C.toHost();
Mat C_ref = A * B; Mat C_ref = A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol); VERIFY((C - C_ref).norm() < tol);
} }
@@ -59,7 +85,7 @@ void test_gemm_adjoint_lhs(Index m, Index n, Index k) {
Mat C = d_C.toHost(); Mat C = d_C.toHost();
Mat C_ref = A.adjoint() * B; Mat C_ref = A.adjoint() * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol); VERIFY((C - C_ref).norm() < tol);
} }
@@ -82,7 +108,7 @@ void test_gemm_transpose_rhs(Index m, Index n, Index k) {
Mat C = d_C.toHost(); Mat C = d_C.toHost();
Mat C_ref = A * B.transpose(); Mat C_ref = A * B.transpose();
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol); VERIFY((C - C_ref).norm() < tol);
} }
@@ -106,7 +132,7 @@ void test_gemm_scaled(Index m, Index n, Index k) {
Mat C = d_C.toHost(); Mat C = d_C.toHost();
Mat C_ref = alpha * A * B; Mat C_ref = alpha * A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol); VERIFY((C - C_ref).norm() < tol);
} }
@@ -130,7 +156,7 @@ void test_gemm_accumulate(Index m, Index n, Index k) {
Mat C = d_C.toHost(); Mat C = d_C.toHost();
Mat C_ref = C_init + A * B; Mat C_ref = C_init + A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol); VERIFY((C - C_ref).norm() < tol);
} }
@@ -153,7 +179,7 @@ void test_gemm_accumulate_empty(Index m, Index n, Index k) {
Mat C = d_C.toHost(); Mat C = d_C.toHost();
Mat C_ref = A * B; Mat C_ref = A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol); VERIFY((C - C_ref).norm() < tol);
} }
@@ -178,7 +204,7 @@ void test_gemm_subtract(Index m, Index n, Index k) {
Mat C = d_C.toHost(); Mat C = d_C.toHost();
Mat C_ref = C_init - A * B; Mat C_ref = C_init - A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol); VERIFY((C - C_ref).norm() < tol);
} }
@@ -202,7 +228,7 @@ void test_gemm_subtract_empty(Index m, Index n, Index k) {
Mat C = d_C.toHost(); Mat C = d_C.toHost();
Mat C_ref = -(A * B); Mat C_ref = -(A * B);
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol); VERIFY((C - C_ref).norm() < tol);
} }
@@ -226,7 +252,7 @@ void test_gemm_scaled_rhs(Index m, Index n, Index k) {
Mat C = d_C.toHost(); Mat C = d_C.toHost();
Mat C_ref = A * (alpha * B); Mat C_ref = A * (alpha * B);
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol); VERIFY((C - C_ref).norm() < tol);
} }
@@ -266,7 +292,7 @@ void test_gemm_explicit_context(Index m, Index n, Index k) {
Mat C = d_C.toHost(); Mat C = d_C.toHost();
Mat C_ref = A * B; Mat C_ref = A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol); VERIFY((C - C_ref).norm() < tol);
} }
@@ -296,7 +322,7 @@ void test_gemm_cross_context_reuse(Index n) {
Mat C = d_C.toHost(); Mat C = d_C.toHost();
Mat C_ref = A * B + D * E; Mat C_ref = A * B + D * E;
RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm(); RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm()) + gemm_error_bound<Scalar>(n, D.norm(), E.norm());
VERIFY((C - C_ref).norm() < tol); VERIFY((C - C_ref).norm() < tol);
} }
@@ -326,7 +352,7 @@ void test_gemm_cross_context_resize() {
Mat C = d_C.toHost(); Mat C = d_C.toHost();
Mat C_ref = D * E; Mat C_ref = D * E;
RealScalar tol = RealScalar(16) * NumTraits<Scalar>::epsilon() * C_ref.norm(); RealScalar tol = gemm_error_bound<Scalar>(16, D.norm(), E.norm());
VERIFY((C - C_ref).norm() < tol); VERIFY((C - C_ref).norm() < tol);
} }
@@ -353,7 +379,9 @@ void test_gemm_chain(Index n) {
Mat D = d_D.toHost(); Mat D = d_D.toHost();
Mat D_ref = (A * B) * E; Mat D_ref = (A * B) * E;
RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * D_ref.norm(); Mat C_ref = A * B;
RealScalar tol =
gemm_error_bound<Scalar>(n, A.norm(), B.norm()) * E.norm() + gemm_error_bound<Scalar>(n, C_ref.norm(), E.norm());
VERIFY((D - D_ref).norm() < tol); VERIFY((D - D_ref).norm() < tol);
} }
@@ -401,7 +429,7 @@ void test_llt_solve_expr(Index n, Index nrhs) {
Mat X = d_X.toHost(); Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm(); RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon()); VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
} }
// ---- LLT solve with explicit context ---------------------------------------- // ---- LLT solve with explicit context ----------------------------------------
@@ -423,7 +451,7 @@ void test_llt_solve_expr_context(Index n, Index nrhs) {
Mat X = d_X.toHost(); Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm(); RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon()); VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
} }
// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------ // ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------
@@ -444,7 +472,7 @@ void test_lu_solve_expr(Index n, Index nrhs) {
Mat X = d_X.toHost(); Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm()); RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon()); VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
} }
// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------ // ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------
@@ -474,7 +502,7 @@ void test_gemm_then_solve(Index n) {
Mat X = d_X.toHost(); Mat X = d_X.toHost();
RealScalar residual = (C * X - D).norm() / D.norm(); RealScalar residual = (C * X - D).norm() / D.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon()); VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
} }
// ---- LLT solve with Upper triangle ----------------------------------------- // ---- LLT solve with Upper triangle -----------------------------------------
@@ -495,7 +523,7 @@ void test_llt_solve_upper(Index n, Index nrhs) {
Mat X = d_X.toHost(); Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm(); RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon()); VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
} }
// ---- LU solve with explicit context ----------------------------------------- // ---- LU solve with explicit context -----------------------------------------
@@ -517,7 +545,7 @@ void test_lu_solve_expr_context(Index n, Index nrhs) {
Mat X = d_X.toHost(); Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm()); RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon()); VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
} }
// ---- Zero-nrhs solver expressions ------------------------------------------ // ---- Zero-nrhs solver expressions ------------------------------------------
@@ -581,7 +609,7 @@ void test_trsm(Index n, Index nrhs) {
Mat X = d_X.toHost(); Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm(); RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon()); VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
} }
// ---- SYMM/HEMM: selfadjointView<UpLo>() * B -------------------------------- // ---- SYMM/HEMM: selfadjointView<UpLo>() * B --------------------------------
@@ -603,7 +631,7 @@ void test_symm(Index n, Index nrhs) {
Mat C = d_C.toHost(); Mat C = d_C.toHost();
Mat C_ref = A * B; // A is symmetric, so full multiply == symm Mat C_ref = A * B; // A is symmetric, so full multiply == symm
RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm(); RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol); VERIFY((C - C_ref).norm() < tol);
} }
@@ -629,7 +657,7 @@ void test_syrk(Index n, Index k) {
Mat C_lower = C.template triangularView<Lower>(); Mat C_lower = C.template triangularView<Lower>();
Mat C_ref_lower = C_ref.template triangularView<Lower>(); Mat C_ref_lower = C_ref.template triangularView<Lower>();
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), A.norm());
VERIFY((C_lower - C_ref_lower).norm() < tol); VERIFY((C_lower - C_ref_lower).norm() < tol);
} }

154
test/gpu_cudss_ldlt.cpp Normal file
View File

@@ -0,0 +1,154 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseLDLT: GPU sparse LDL^T via cuDSS.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse symmetric indefinite matrix ---------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_symmetric_indefinite(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
// Build a random sparse matrix and symmetrize it.
// The diagonal has mixed signs to ensure indefiniteness.
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
R.makeCompressed();
// A = R + R^H (symmetric), then add diagonal with alternating signs for indefiniteness.
SpMat A = R + SparseMatrix<Scalar, ColMajor, int>(R.adjoint());
for (Index i = 0; i < n; ++i) {
Scalar diag_val = Scalar((i % 2 == 0) ? n : -n);
A.coeffRef(i, i) += diag_val;
}
A.makeCompressed();
return A;
}
// ---- Solve and check residual -----------------------------------------------
template <typename Scalar>
void test_solve(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_symmetric_indefinite<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLDLT<Scalar> ldlt(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Vec x = ldlt.solve(b);
VERIFY_IS_EQUAL(x.rows(), n);
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Multiple RHS -----------------------------------------------------------
template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_symmetric_indefinite<Scalar>(n);
Mat B = Mat::Random(n, nrhs);
GpuSparseLDLT<Scalar> ldlt(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Mat X = ldlt.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
Mat R = A * X - B;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(R.norm() / B.norm() < tol);
}
// ---- Refactorize ------------------------------------------------------------
template <typename Scalar>
void test_refactorize(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_symmetric_indefinite<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLDLT<Scalar> ldlt;
ldlt.analyzePattern(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
ldlt.factorize(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Vec x1 = ldlt.solve(b);
// Modify values, keep pattern.
SpMat A2 = A;
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
ldlt.factorize(A2);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Vec x2 = ldlt.solve(b);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((A * x1 - b).norm() / b.norm() < tol);
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}
// ---- Empty ------------------------------------------------------------------
void test_empty() {
using SpMat = SparseMatrix<double, ColMajor, int>;
SpMat A(0, 0);
A.makeCompressed();
GpuSparseLDLT<double> ldlt(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
VERIFY_IS_EQUAL(ldlt.rows(), 0);
VERIFY_IS_EQUAL(ldlt.cols(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_solve<Scalar>(64));
CALL_SUBTEST(test_solve<Scalar>(256));
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
CALL_SUBTEST(test_refactorize<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cudss_ldlt) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_empty());
}

202
test/gpu_cudss_llt.cpp Normal file
View File

@@ -0,0 +1,202 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseLLT: GPU sparse Cholesky via cuDSS.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse SPD matrix -------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
// Uses the global std::rand state seeded by the test framework (g_seed).
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
R.makeCompressed();
// A = R^H * R + n * I (guaranteed SPD).
SpMat A = R.adjoint() * R;
for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
A.makeCompressed();
return A;
}
// ---- Solve and check residual -----------------------------------------------
template <typename Scalar>
void test_solve(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x = llt.solve(b);
VERIFY_IS_EQUAL(x.rows(), n);
// Check residual: ||Ax - b|| / ||b||.
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Compare with CPU SimplicialLLT -----------------------------------------
template <typename Scalar>
void test_vs_cpu(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar> gpu_llt(A);
VERIFY_IS_EQUAL(gpu_llt.info(), Success);
Vec x_gpu = gpu_llt.solve(b);
SimplicialLLT<SpMat> cpu_llt(A);
VERIFY_IS_EQUAL(cpu_llt.info(), Success);
Vec x_cpu = cpu_llt.solve(b);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((x_gpu - x_cpu).norm() / x_cpu.norm() < tol);
}
// ---- Multiple RHS -----------------------------------------------------------
template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Mat B = Mat::Random(n, nrhs);
GpuSparseLLT<Scalar> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Mat X = llt.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
Mat R = A * X - B;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(R.norm() / B.norm() < tol);
}
// ---- Separate analyze + factorize (refactorization) -------------------------
template <typename Scalar>
void test_refactorize(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar> llt;
llt.analyzePattern(A);
VERIFY_IS_EQUAL(llt.info(), Success);
// First factorize + solve.
llt.factorize(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x1 = llt.solve(b);
// Modify values (keep same pattern): scale diagonal.
SpMat A2 = A;
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
// Refactorize with same pattern.
llt.factorize(A2);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x2 = llt.solve(b);
// Both solutions should satisfy their respective systems.
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((A * x1 - b).norm() / b.norm() < tol);
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
// Solutions should differ (A2 != A).
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}
// ---- Empty matrix -----------------------------------------------------------
void test_empty() {
using SpMat = SparseMatrix<double, ColMajor, int>;
SpMat A(0, 0);
A.makeCompressed();
GpuSparseLLT<double> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
VERIFY_IS_EQUAL(llt.rows(), 0);
VERIFY_IS_EQUAL(llt.cols(), 0);
}
// ---- Upper triangle ---------------------------------------------------------
template <typename Scalar>
void test_upper(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar, Upper> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x = llt.solve(b);
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_solve<Scalar>(64));
CALL_SUBTEST(test_solve<Scalar>(256));
CALL_SUBTEST(test_vs_cpu<Scalar>(64));
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
CALL_SUBTEST(test_refactorize<Scalar>(64));
CALL_SUBTEST(test_upper<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cudss_llt) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_empty());
}

147
test/gpu_cudss_lu.cpp Normal file
View File

@@ -0,0 +1,147 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseLU: GPU sparse LU via cuDSS.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse non-singular general matrix ---------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_general(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
// Add strong diagonal for non-singularity.
for (Index i = 0; i < n; ++i) R.coeffRef(i, i) += Scalar(RealScalar(n));
R.makeCompressed();
return R;
}
// ---- Solve and check residual -----------------------------------------------
template <typename Scalar>
void test_solve(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_general<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLU<Scalar> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
Vec x = lu.solve(b);
VERIFY_IS_EQUAL(x.rows(), n);
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Multiple RHS -----------------------------------------------------------
template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_general<Scalar>(n);
Mat B = Mat::Random(n, nrhs);
GpuSparseLU<Scalar> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
Mat X = lu.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
Mat R = A * X - B;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(R.norm() / B.norm() < tol);
}
// ---- Refactorize ------------------------------------------------------------
template <typename Scalar>
void test_refactorize(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_general<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLU<Scalar> lu;
lu.analyzePattern(A);
VERIFY_IS_EQUAL(lu.info(), Success);
lu.factorize(A);
VERIFY_IS_EQUAL(lu.info(), Success);
Vec x1 = lu.solve(b);
// Modify values, keep pattern.
SpMat A2 = A;
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
lu.factorize(A2);
VERIFY_IS_EQUAL(lu.info(), Success);
Vec x2 = lu.solve(b);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((A * x1 - b).norm() / b.norm() < tol);
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}
// ---- Empty ------------------------------------------------------------------
void test_empty() {
using SpMat = SparseMatrix<double, ColMajor, int>;
SpMat A(0, 0);
A.makeCompressed();
GpuSparseLU<double> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
VERIFY_IS_EQUAL(lu.rows(), 0);
VERIFY_IS_EQUAL(lu.cols(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_solve<Scalar>(64));
CALL_SUBTEST(test_solve<Scalar>(256));
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
CALL_SUBTEST(test_refactorize<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cudss_lu) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_empty());
}

186
test/gpu_cufft.cpp Normal file
View File

@@ -0,0 +1,186 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuFFT: GPU FFT via cuFFT.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/GPU>
using namespace Eigen;
// ---- 1D C2C roundtrip: inv(fwd(x)) ≈ x -------------------------------------
template <typename Scalar>
void test_c2c_roundtrip(Index n) {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
using RealScalar = Scalar;
Vec x = Vec::Random(n);
GpuFFT<Scalar> fft;
Vec X = fft.fwd(x);
VERIFY_IS_EQUAL(X.size(), n);
Vec y = fft.inv(X);
VERIFY_IS_EQUAL(y.size(), n);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y - x).norm() / x.norm() < tol);
}
// ---- 1D C2C known signal: FFT of constant = delta --------------------------
template <typename Scalar>
void test_c2c_constant() {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
using RealScalar = Scalar;
const int n = 64;
Vec x = Vec::Constant(n, Complex(3.0, 0.0));
GpuFFT<Scalar> fft;
Vec X = fft.fwd(x);
// FFT of constant c: X[0] = c*n, X[k] = 0 for k > 0.
RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(n);
VERIFY(std::abs(X(0) - Complex(3.0 * n, 0.0)) < tol);
for (int k = 1; k < n; ++k) {
VERIFY(std::abs(X(k)) < tol);
}
}
// ---- 1D R2C/C2R roundtrip: invReal(fwd(r), n) ≈ r --------------------------
template <typename Scalar>
void test_r2c_roundtrip(Index n) {
using Complex = std::complex<Scalar>;
using CVec = Matrix<Complex, Dynamic, 1>;
using RVec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = Scalar;
RVec r = RVec::Random(n);
GpuFFT<Scalar> fft;
CVec R = fft.fwd(r);
// R2C returns n/2+1 complex values.
VERIFY_IS_EQUAL(R.size(), n / 2 + 1);
RVec s = fft.invReal(R, n);
VERIFY_IS_EQUAL(s.size(), n);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((s - r).norm() / r.norm() < tol);
}
// ---- 2D C2C roundtrip: inv2d(fwd2d(A)) ≈ A ---------------------------------
template <typename Scalar>
void test_2d_roundtrip(Index rows, Index cols) {
using Complex = std::complex<Scalar>;
using Mat = Matrix<Complex, Dynamic, Dynamic>;
using RealScalar = Scalar;
Mat A = Mat::Random(rows, cols);
GpuFFT<Scalar> fft;
Mat B = fft.fwd2d(A);
VERIFY_IS_EQUAL(B.rows(), rows);
VERIFY_IS_EQUAL(B.cols(), cols);
Mat C = fft.inv2d(B);
VERIFY_IS_EQUAL(C.rows(), rows);
VERIFY_IS_EQUAL(C.cols(), cols);
RealScalar tol = RealScalar(10) * RealScalar(rows * cols) * NumTraits<Scalar>::epsilon();
VERIFY((C - A).norm() / A.norm() < tol);
}
// ---- 2D C2C known signal: constant matrix -----------------------------------
template <typename Scalar>
void test_2d_constant() {
using Complex = std::complex<Scalar>;
using Mat = Matrix<Complex, Dynamic, Dynamic>;
using RealScalar = Scalar;
const int rows = 16, cols = 32;
Mat A = Mat::Constant(rows, cols, Complex(2.0, 0.0));
GpuFFT<Scalar> fft;
Mat B = fft.fwd2d(A);
// 2D FFT of constant c: B(0,0) = c*rows*cols, all others = 0.
RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(rows * cols);
VERIFY(std::abs(B(0, 0) - Complex(2.0 * rows * cols, 0.0)) < tol);
for (int j = 0; j < cols; ++j) {
for (int i = 0; i < rows; ++i) {
if (i == 0 && j == 0) continue;
VERIFY(std::abs(B(i, j)) < tol);
}
}
}
// ---- Plan reuse: repeated calls should work ---------------------------------
template <typename Scalar>
void test_plan_reuse() {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
using RealScalar = Scalar;
GpuFFT<Scalar> fft;
for (int trial = 0; trial < 5; ++trial) {
Vec x = Vec::Random(128);
Vec X = fft.fwd(x);
Vec y = fft.inv(X);
RealScalar tol = RealScalar(10) * RealScalar(128) * NumTraits<Scalar>::epsilon();
VERIFY((y - x).norm() / x.norm() < tol);
}
}
// ---- Empty ------------------------------------------------------------------
template <typename Scalar>
void test_empty() {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
GpuFFT<Scalar> fft;
Vec x(0);
Vec X = fft.fwd(x);
VERIFY_IS_EQUAL(X.size(), 0);
Vec y = fft.inv(X);
VERIFY_IS_EQUAL(y.size(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(64));
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(256));
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(1000)); // non-power-of-2
CALL_SUBTEST(test_c2c_constant<Scalar>());
CALL_SUBTEST(test_r2c_roundtrip<Scalar>(64));
CALL_SUBTEST(test_r2c_roundtrip<Scalar>(256));
CALL_SUBTEST(test_2d_roundtrip<Scalar>(32, 32));
CALL_SUBTEST(test_2d_roundtrip<Scalar>(16, 64)); // non-square
CALL_SUBTEST(test_2d_constant<Scalar>());
CALL_SUBTEST(test_plan_reuse<Scalar>());
CALL_SUBTEST(test_empty<Scalar>());
}
EIGEN_DECLARE_TEST(gpu_cufft) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
}

180
test/gpu_cusolver_eigen.cpp Normal file
View File

@@ -0,0 +1,180 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSelfAdjointEigenSolver: GPU symmetric/Hermitian eigenvalue
// decomposition via cuSOLVER.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Eigenvalues>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Reconstruction: V * diag(W) * V^H ≈ A ---------------------------------
template <typename Scalar>
void test_eigen_reconstruction(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
// Build a symmetric/Hermitian matrix.
Mat R = Mat::Random(n, n);
Mat A = R + R.adjoint();
GpuSelfAdjointEigenSolver<Scalar> es(A);
VERIFY_IS_EQUAL(es.info(), Success);
auto W = es.eigenvalues();
Mat V = es.eigenvectors();
VERIFY_IS_EQUAL(W.size(), n);
VERIFY_IS_EQUAL(V.rows(), n);
VERIFY_IS_EQUAL(V.cols(), n);
// Reconstruct: A_hat = V * diag(W) * V^H.
Mat A_hat = V * W.asDiagonal() * V.adjoint();
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
VERIFY((A_hat - A).norm() < tol);
// Orthogonality: V^H * V ≈ I.
Mat VhV = V.adjoint() * V;
Mat eye = Mat::Identity(n, n);
VERIFY((VhV - eye).norm() < tol);
}
// ---- Eigenvalues match CPU SelfAdjointEigenSolver ---------------------------
template <typename Scalar>
void test_eigen_values(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat R = Mat::Random(n, n);
Mat A = R + R.adjoint();
GpuSelfAdjointEigenSolver<Scalar> gpu_es(A);
VERIFY_IS_EQUAL(gpu_es.info(), Success);
auto W_gpu = gpu_es.eigenvalues();
SelfAdjointEigenSolver<Mat> cpu_es(A);
auto W_cpu = cpu_es.eigenvalues();
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() *
W_cpu.cwiseAbs().maxCoeff();
VERIFY((W_gpu - W_cpu).norm() < tol);
}
// ---- Eigenvalues-only mode --------------------------------------------------
template <typename Scalar>
void test_eigen_values_only(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat R = Mat::Random(n, n);
Mat A = R + R.adjoint();
GpuSelfAdjointEigenSolver<Scalar> gpu_es(A, GpuSelfAdjointEigenSolver<Scalar>::EigenvaluesOnly);
VERIFY_IS_EQUAL(gpu_es.info(), Success);
auto W_gpu = gpu_es.eigenvalues();
SelfAdjointEigenSolver<Mat> cpu_es(A, EigenvaluesOnly);
auto W_cpu = cpu_es.eigenvalues();
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() *
W_cpu.cwiseAbs().maxCoeff();
VERIFY((W_gpu - W_cpu).norm() < tol);
}
// ---- DeviceMatrix input path ------------------------------------------------
template <typename Scalar>
void test_eigen_device_matrix(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat R = Mat::Random(n, n);
Mat A = R + R.adjoint();
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
GpuSelfAdjointEigenSolver<Scalar> es;
es.compute(d_A);
VERIFY_IS_EQUAL(es.info(), Success);
auto W_gpu = es.eigenvalues();
Mat V = es.eigenvectors();
// Verify reconstruction.
Mat A_hat = V * W_gpu.asDiagonal() * V.adjoint();
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
VERIFY((A_hat - A).norm() < tol);
}
// ---- Recompute (reuse solver object) ----------------------------------------
template <typename Scalar>
void test_eigen_recompute(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
GpuSelfAdjointEigenSolver<Scalar> es;
for (int trial = 0; trial < 3; ++trial) {
Mat R = Mat::Random(n, n);
Mat A = R + R.adjoint();
es.compute(A);
VERIFY_IS_EQUAL(es.info(), Success);
auto W = es.eigenvalues();
Mat V = es.eigenvectors();
Mat A_hat = V * W.asDiagonal() * V.adjoint();
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
VERIFY((A_hat - A).norm() < tol);
}
}
// ---- Empty matrix -----------------------------------------------------------
void test_eigen_empty() {
GpuSelfAdjointEigenSolver<double> es(MatrixXd(0, 0));
VERIFY_IS_EQUAL(es.info(), Success);
VERIFY_IS_EQUAL(es.rows(), 0);
VERIFY_IS_EQUAL(es.cols(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
// Reconstruction + orthogonality.
CALL_SUBTEST(test_eigen_reconstruction<Scalar>(64));
CALL_SUBTEST(test_eigen_reconstruction<Scalar>(128));
// Eigenvalues match CPU.
CALL_SUBTEST(test_eigen_values<Scalar>(64));
CALL_SUBTEST(test_eigen_values<Scalar>(128));
// Values-only mode.
CALL_SUBTEST(test_eigen_values_only<Scalar>(64));
// DeviceMatrix input.
CALL_SUBTEST(test_eigen_device_matrix<Scalar>(64));
// Recompute.
CALL_SUBTEST(test_eigen_recompute<Scalar>(32));
}
EIGEN_DECLARE_TEST(gpu_cusolver_eigen) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_eigen_empty());
}

185
test/gpu_cusolver_qr.cpp Normal file
View File

@@ -0,0 +1,185 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuQR: GPU QR decomposition via cuSOLVER.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/QR>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Solve square system: A * X = B -----------------------------------------
template <typename Scalar>
void test_qr_solve_square(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
GpuQR<Scalar> qr(A);
VERIFY_IS_EQUAL(qr.info(), Success);
Mat X = qr.solve(B);
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- Solve overdetermined system: m > n (least-squares) ---------------------
template <typename Scalar>
void test_qr_solve_overdetermined(Index m, Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
eigen_assert(m >= n);
Mat A = Mat::Random(m, n);
Mat B = Mat::Random(m, nrhs);
GpuQR<Scalar> qr(A);
VERIFY_IS_EQUAL(qr.info(), Success);
Mat X = qr.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
// Compare with CPU QR.
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
RealScalar tol = RealScalar(100) * RealScalar(m) * NumTraits<Scalar>::epsilon();
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
}
// ---- Solve with DeviceMatrix input ------------------------------------------
template <typename Scalar>
void test_qr_solve_device(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuQR<Scalar> qr;
qr.compute(d_A);
VERIFY_IS_EQUAL(qr.info(), Success);
DeviceMatrix<Scalar> d_X = qr.solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- Solve overdetermined via device path -----------------------------------
template <typename Scalar>
void test_qr_solve_overdetermined_device(Index m, Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
eigen_assert(m >= n);
Mat A = Mat::Random(m, n);
Mat B = Mat::Random(m, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuQR<Scalar> qr;
qr.compute(d_A);
VERIFY_IS_EQUAL(qr.info(), Success);
DeviceMatrix<Scalar> d_X = qr.solve(d_B);
VERIFY_IS_EQUAL(d_X.rows(), n);
VERIFY_IS_EQUAL(d_X.cols(), nrhs);
Mat X = d_X.toHost();
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
RealScalar tol = RealScalar(100) * RealScalar(m) * NumTraits<Scalar>::epsilon();
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
}
// ---- Multiple solves reuse the factorization --------------------------------
template <typename Scalar>
void test_qr_multiple_solves(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
GpuQR<Scalar> qr(A);
VERIFY_IS_EQUAL(qr.info(), Success);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
for (int k = 0; k < 5; ++k) {
Mat B = Mat::Random(n, 3);
Mat X = qr.solve(B);
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < tol);
}
}
// ---- Agreement with CPU HouseholderQR ---------------------------------------
template <typename Scalar>
void test_qr_vs_cpu(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
GpuQR<Scalar> gpu_qr(A);
VERIFY_IS_EQUAL(gpu_qr.info(), Success);
Mat X_gpu = gpu_qr.solve(B);
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_qr_solve_square<Scalar>(1, 1));
CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 1));
CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 4));
CALL_SUBTEST(test_qr_solve_square<Scalar>(256, 8));
CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(128, 64, 4));
CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(256, 128, 1));
CALL_SUBTEST(test_qr_solve_device<Scalar>(64, 4));
CALL_SUBTEST(test_qr_solve_overdetermined_device<Scalar>(128, 64, 4));
CALL_SUBTEST(test_qr_multiple_solves<Scalar>(64));
CALL_SUBTEST(test_qr_vs_cpu<Scalar>(64, 4));
CALL_SUBTEST(test_qr_vs_cpu<Scalar>(256, 8));
}
void test_qr_empty() {
GpuQR<double> qr(MatrixXd(0, 0));
VERIFY_IS_EQUAL(qr.info(), Success);
VERIFY_IS_EQUAL(qr.rows(), 0);
VERIFY_IS_EQUAL(qr.cols(), 0);
}
EIGEN_DECLARE_TEST(gpu_cusolver_qr) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_qr_empty());
}

194
test/gpu_cusolver_svd.cpp Normal file
View File

@@ -0,0 +1,194 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSVD: GPU SVD via cuSOLVER.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/SVD>
#include <Eigen/GPU>
using namespace Eigen;
// ---- SVD reconstruction: U * diag(S) * VT ≈ A ------------------------------
template <typename Scalar, unsigned int Options>
void test_svd_reconstruction(Index m, Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, n);
GpuSVD<Scalar> svd(A, Options);
VERIFY_IS_EQUAL(svd.info(), Success);
auto S = svd.singularValues();
Mat U = svd.matrixU();
Mat VT = svd.matrixVT();
const Index k = (std::min)(m, n);
// Reconstruct: A_hat = U[:,:k] * diag(S) * VT[:k,:].
Mat A_hat = U.leftCols(k) * S.asDiagonal() * VT.topRows(k);
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(k)) * NumTraits<Scalar>::epsilon() * A.norm();
VERIFY((A_hat - A).norm() < tol);
// Orthogonality: U^H * U ≈ I.
Mat UtU = U.adjoint() * U;
Mat I_u = Mat::Identity(U.cols(), U.cols());
VERIFY((UtU - I_u).norm() < tol);
// Orthogonality: VT * VT^H ≈ I.
Mat VtVh = VT * VT.adjoint();
Mat I_v = Mat::Identity(VT.rows(), VT.rows());
VERIFY((VtVh - I_v).norm() < tol);
}
// ---- Singular values match CPU BDCSVD ---------------------------------------
template <typename Scalar>
void test_svd_singular_values(Index m, Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, n);
GpuSVD<Scalar> svd(A, 0); // values only
VERIFY_IS_EQUAL(svd.info(), Success);
auto S_gpu = svd.singularValues();
auto S_cpu = BDCSVD<Mat>(A, 0).singularValues();
RealScalar tol =
RealScalar(5) * std::sqrt(static_cast<RealScalar>((std::min)(m, n))) * NumTraits<Scalar>::epsilon() * S_cpu(0);
VERIFY((S_gpu - S_cpu).norm() < tol);
}
// ---- Solve: pseudoinverse ---------------------------------------------------
template <typename Scalar>
void test_svd_solve(Index m, Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, n);
Mat B = Mat::Random(m, nrhs);
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
VERIFY_IS_EQUAL(svd.info(), Success);
Mat X = svd.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
// Compare with CPU BDCSVD solve.
Mat X_cpu = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV).solve(B);
RealScalar tol = RealScalar(100) * RealScalar((std::max)(m, n)) * NumTraits<Scalar>::epsilon();
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
}
// ---- Solve: truncated -------------------------------------------------------
template <typename Scalar>
void test_svd_solve_truncated(Index m, Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, n);
Mat B = Mat::Random(m, 1);
const Index k = (std::min)(m, n);
const Index trunc = k / 2;
eigen_assert(trunc > 0);
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
Mat X_trunc = svd.solve(B, trunc);
// Build CPU reference: truncated pseudoinverse.
auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
auto S = cpu_svd.singularValues();
Mat U = cpu_svd.matrixU();
Mat V = cpu_svd.matrixV();
// D_ii = 1/S_i for i < trunc, 0 otherwise.
Matrix<RealScalar, Dynamic, 1> D = Matrix<RealScalar, Dynamic, 1>::Zero(k);
for (Index i = 0; i < trunc; ++i) D(i) = RealScalar(1) / S(i);
Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;
RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
VERIFY((X_trunc - X_ref).norm() / X_ref.norm() < tol);
}
// ---- Solve: Tikhonov regularized --------------------------------------------
template <typename Scalar>
void test_svd_solve_regularized(Index m, Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, n);
Mat B = Mat::Random(m, 1);
RealScalar lambda = RealScalar(0.1);
const Index k = (std::min)(m, n);
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
Mat X_reg = svd.solve(B, lambda);
// CPU reference: D_ii = S_i / (S_i^2 + lambda^2).
auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
auto S = cpu_svd.singularValues();
Mat U = cpu_svd.matrixU();
Mat V = cpu_svd.matrixV();
Matrix<RealScalar, Dynamic, 1> D(k);
for (Index i = 0; i < k; ++i) D(i) = S(i) / (S(i) * S(i) + lambda * lambda);
Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;
RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
VERIFY((X_reg - X_ref).norm() / X_ref.norm() < tol);
}
// ---- Empty matrix -----------------------------------------------------------
void test_svd_empty() {
GpuSVD<double> svd(MatrixXd(0, 0), 0);
VERIFY_IS_EQUAL(svd.info(), Success);
VERIFY_IS_EQUAL(svd.rows(), 0);
VERIFY_IS_EQUAL(svd.cols(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
// Reconstruction + orthogonality (thin and full, identical test logic).
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 64)));
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(128, 64)));
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 128))); // wide (m < n)
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(64, 64)));
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(128, 64)));
// Singular values.
CALL_SUBTEST(test_svd_singular_values<Scalar>(64, 64));
CALL_SUBTEST(test_svd_singular_values<Scalar>(128, 64));
// Solve.
CALL_SUBTEST(test_svd_solve<Scalar>(64, 64, 4));
CALL_SUBTEST(test_svd_solve<Scalar>(128, 64, 4));
CALL_SUBTEST(test_svd_solve<Scalar>(64, 128, 4)); // wide (m < n)
// Truncated and regularized solve.
CALL_SUBTEST(test_svd_solve_truncated<Scalar>(64, 64));
CALL_SUBTEST(test_svd_solve_regularized<Scalar>(64, 64));
}
EIGEN_DECLARE_TEST(gpu_cusolver_svd) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_svd_empty());
}

203
test/gpu_cusparse_spmv.cpp Normal file
View File

@@ -0,0 +1,203 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseContext: GPU SpMV/SpMM via cuSPARSE.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse matrix -----------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_sparse(Index rows, Index cols, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat R(rows, cols);
R.reserve(VectorXi::Constant(cols, static_cast<int>(rows * density) + 1));
for (Index j = 0; j < cols; ++j) {
for (Index i = 0; i < rows; ++i) {
if ((std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(RealScalar(std::rand() / double(RAND_MAX) - 0.5));
}
}
}
R.makeCompressed();
return R;
}
// ---- SpMV: y = A * x -------------------------------------------------------
template <typename Scalar>
void test_spmv(Index rows, Index cols) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(rows, cols);
Vec x = Vec::Random(cols);
GpuSparseContext<Scalar> ctx;
Vec y_gpu = ctx.multiply(A, x);
Vec y_cpu = A * x;
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
VERIFY_IS_EQUAL(y_gpu.size(), rows);
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- SpMV with alpha/beta: y = alpha*A*x + beta*y ---------------------------
template <typename Scalar>
void test_spmv_alpha_beta(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(n, n);
Vec x = Vec::Random(n);
Vec y_init = Vec::Random(n);
Scalar alpha(2);
Scalar beta(3);
Vec y_cpu = alpha * (A * x) + beta * y_init;
GpuSparseContext<Scalar> ctx;
Vec y_gpu = y_init;
ctx.multiply(A, x, y_gpu, alpha, beta);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- Transpose: y = A^T * x ------------------------------------------------
template <typename Scalar>
void test_spmv_transpose(Index rows, Index cols) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(rows, cols);
Vec x = Vec::Random(rows);
GpuSparseContext<Scalar> ctx;
Vec y_gpu = ctx.multiplyT(A, x);
Vec y_cpu = A.transpose() * x;
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
VERIFY_IS_EQUAL(y_gpu.size(), cols);
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- SpMM: Y = A * X (multiple RHS) ----------------------------------------
template <typename Scalar>
void test_spmm(Index rows, Index cols, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(rows, cols);
Mat X = Mat::Random(cols, nrhs);
GpuSparseContext<Scalar> ctx;
Mat Y_gpu = ctx.multiplyMat(A, X);
Mat Y_cpu = A * X;
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
VERIFY_IS_EQUAL(Y_gpu.rows(), rows);
VERIFY_IS_EQUAL(Y_gpu.cols(), nrhs);
VERIFY((Y_gpu - Y_cpu).norm() / (Y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- Identity matrix: I * x = x --------------------------------------------
template <typename Scalar>
void test_identity(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
// Build sparse identity.
SpMat eye(n, n);
eye.setIdentity();
eye.makeCompressed();
Vec x = Vec::Random(n);
GpuSparseContext<Scalar> ctx;
Vec y = ctx.multiply(eye, x);
RealScalar tol = NumTraits<Scalar>::epsilon();
VERIFY((y - x).norm() < tol);
}
// ---- Context reuse ----------------------------------------------------------
template <typename Scalar>
void test_reuse(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
GpuSparseContext<Scalar> ctx;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
for (int trial = 0; trial < 3; ++trial) {
SpMat A = make_sparse<Scalar>(n, n);
Vec x = Vec::Random(n);
Vec y_gpu = ctx.multiply(A, x);
Vec y_cpu = A * x;
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
}
// ---- Empty ------------------------------------------------------------------
template <typename Scalar>
void test_empty() {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
SpMat A(0, 0);
A.makeCompressed();
Vec x(0);
GpuSparseContext<Scalar> ctx;
Vec y = ctx.multiply(A, x);
VERIFY_IS_EQUAL(y.size(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_spmv<Scalar>(64, 64));
CALL_SUBTEST(test_spmv<Scalar>(128, 64)); // non-square
CALL_SUBTEST(test_spmv<Scalar>(64, 128)); // wide
CALL_SUBTEST(test_spmv_alpha_beta<Scalar>(64));
CALL_SUBTEST(test_spmv_transpose<Scalar>(128, 64));
CALL_SUBTEST(test_spmm<Scalar>(64, 64, 4));
CALL_SUBTEST(test_identity<Scalar>(64));
CALL_SUBTEST(test_reuse<Scalar>(64));
CALL_SUBTEST(test_empty<Scalar>());
}
EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
}

View File

@@ -35,7 +35,6 @@ void test_allocate(Index rows, Index cols) {
VERIFY(!dm.empty()); VERIFY(!dm.empty());
VERIFY_IS_EQUAL(dm.rows(), rows); VERIFY_IS_EQUAL(dm.rows(), rows);
VERIFY_IS_EQUAL(dm.cols(), cols); VERIFY_IS_EQUAL(dm.cols(), cols);
VERIFY_IS_EQUAL(dm.outerStride(), rows);
VERIFY(dm.data() != nullptr); VERIFY(dm.data() != nullptr);
VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar)); VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar));
} }
@@ -69,7 +68,7 @@ void test_roundtrip_async(Index rows, Index cols) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream)); EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));
// Async upload from raw pointer. // Async upload from raw pointer.
auto dm = DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, rows, stream); auto dm = DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, stream);
VERIFY_IS_EQUAL(dm.rows(), rows); VERIFY_IS_EQUAL(dm.rows(), rows);
VERIFY_IS_EQUAL(dm.cols(), cols); VERIFY_IS_EQUAL(dm.cols(), cols);
@@ -185,7 +184,6 @@ void test_resize() {
dm.resize(50, 30); dm.resize(50, 30);
VERIFY_IS_EQUAL(dm.rows(), 50); VERIFY_IS_EQUAL(dm.rows(), 50);
VERIFY_IS_EQUAL(dm.cols(), 30); VERIFY_IS_EQUAL(dm.cols(), 30);
VERIFY_IS_EQUAL(dm.outerStride(), 50);
VERIFY(dm.data() != nullptr); VERIFY(dm.data() != nullptr);
// Resize to same dimensions is a no-op. // Resize to same dimensions is a no-op.