Compare commits

...

3 Commits

Author SHA1 Message Date
Rasmus Munk Larsen
014f12f11a GPU: Add BLAS-1 ops, DeviceScalar, device-resident SpMV, and CG interop (5/5)
Add the operator interface needed for GPU iterative solvers:

- BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(),
  noalias(), operator+=/-=/*= dispatching to cuBLAS axpy/scal/dot/nrm2.
- DeviceScalar<Scalar>: device-resident scalar returned by reductions.
  Defers host sync until value is read (implicit conversion). Device-side
  division via NPP for real types.
- GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(),
  cusparseHandle().
- GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm
  selection and plan caching.
- GpuSparseContext: GpuContext& constructor for same-stream execution,
  deviceView() returning DeviceSparseView with operator* for device-resident
  SpMV (d_y = d_A * d_x).
- geam expressions: d_C = d_A + alpha * d_B via cublasXgeam.
- GpuSVD::matrixV() convenience wrapper.

These additions make DeviceMatrix usable as a VectorType in Eigen algorithm
templates. Conjugate gradient is the motivating example and is tested against
CPU ConjugateGradient for correctness.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 20:19:59 -07:00
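A minimal sketch of the BLAS-1/DeviceScalar semantics this commit describes, pieced together from the operator surface in the diffs below. The free function, the <Eigen/GPU> module header, and the scalar*DeviceMatrix overload building a DeviceScaled expression are assumptions, not code from the test suite:

#include <Eigen/GPU>  // assumed module header exporting DeviceMatrix/DeviceScalar

// y += 2x, then x *= dot(x, y), all device-resident; one host sync at the end.
void blas1_sketch(Eigen::DeviceMatrix<double>& d_x, Eigen::DeviceMatrix<double>& d_y) {
  d_y += 2.0 * d_x;                              // cublasDaxpy via a DeviceScaled expression
  Eigen::DeviceScalar<double> d = d_x.dot(d_y);  // reduction result stays on the device
  d_x *= d;                                      // cublasDscal with CUBLAS_POINTER_MODE_DEVICE
  double v = d;  // implicit conversion reads the value: the only host sync here
  (void)v;
}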
Rasmus Munk Larsen
43a95b62bb GPU: Add sparse solvers, FFT, and SpMV (cuDSS, cuFFT, cuSPARSE)
Add GPU sparse direct solvers (Cholesky, LDL^T, LU) via cuDSS, 1D/2D FFT
via cuFFT with plan caching, and sparse matrix-vector/matrix multiply
(SpMV/SpMM) via cuSPARSE.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 19:11:49 -07:00
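A hedged sketch of the cuDSS path this commit adds. GpuSparseLLT is named in the include list below; the template parameter and the compute()/solve() surface are assumptions that it mirrors Eigen's SimplicialLLT-style solver API:

#include <Eigen/Sparse>
#include <Eigen/GPU>  // assumed module header

// Solve an SPD system A x = b with the cuDSS Cholesky backend (sketch).
Eigen::VectorXd solve_spd(const Eigen::SparseMatrix<double>& A, const Eigen::VectorXd& b) {
  Eigen::GpuSparseLLT<Eigen::SparseMatrix<double>> llt;  // maps to CUDSS_MTYPE_SPD
  llt.compute(A);       // assumed: analysis + numeric factorization on the GPU
  return llt.solve(b);  // assumed: upload b, solve on device, download x
}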
Rasmus Munk Larsen
8593c7f5a1 GPU: Add dense cuSOLVER solvers (QR, SVD, EigenSolver)
Add QR (geqrf + ormqr + trsm), SVD (gesvd), and self-adjoint eigenvalue
decomposition (syevd) via cuSOLVER. All support host and DeviceMatrix input.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 19:11:34 -07:00
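A hedged sketch of the dense path. GpuQR and GpuSVD are the class names from this commit; the constructor/solve shape is an assumption that it follows Eigen's decomposition convention:

#include <Eigen/Dense>
#include <Eigen/GPU>  // assumed module header

// Least-squares solve via GPU QR: geqrf factors A, ormqr applies Q^T to B,
// trsm back-substitutes with R (per the commit message).
Eigen::MatrixXd lstsq(const Eigen::MatrixXd& A, const Eigen::MatrixXd& B) {
  Eigen::GpuQR<Eigen::MatrixXd> qr(A);  // host input is supported per the message
  return qr.solve(B);                   // assumed API shape
}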
44 changed files with 8157 additions and 198 deletions

View File

@@ -39,6 +39,7 @@
#ifdef EIGEN_USE_GPU
// IWYU pragma: begin_exports
#include "src/GPU/DeviceScalar.h"
#include "src/GPU/DeviceMatrix.h"
#include "src/GPU/GpuContext.h"
#include "src/GPU/DeviceExpr.h"
@@ -47,6 +48,22 @@
#include "src/GPU/DeviceDispatch.h"
#include "src/GPU/GpuLLT.h"
#include "src/GPU/GpuLU.h"
#include "src/GPU/GpuQR.h"
#include "src/GPU/GpuSVD.h"
#include "src/GPU/GpuEigenSolver.h"
#include "src/GPU/CuFftSupport.h"
#include "src/GPU/GpuFFT.h"
#include "src/GPU/CuSparseSupport.h"
#ifdef EIGEN_SPARSECORE_MODULE_H
#include "src/GPU/GpuSparseContext.h"
#endif
#if defined(EIGEN_CUDSS) && defined(EIGEN_SPARSECORE_MODULE_H)
#include "src/GPU/CuDssSupport.h"
#include "src/GPU/GpuSparseSolverBase.h"
#include "src/GPU/GpuSparseLLT.h"
#include "src/GPU/GpuSparseLDLT.h"
#include "src/GPU/GpuSparseLU.h"
#endif
// IWYU pragma: end_exports
#endif

View File

@@ -21,6 +21,7 @@
#include "./GpuSupport.h" #include "./GpuSupport.h"
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cublasLt.h>
namespace Eigen { namespace Eigen {
namespace internal { namespace internal {
@@ -50,27 +51,170 @@ constexpr cublasOperation_t to_cublas_op(GpuOp op) {
} }
// ---- Scalar → cublasComputeType_t -------------------------------------------
-// cublasGemmEx requires a compute type (separate from the data type).
+// cublasLtMatmul requires a compute type (separate from the data type).
//
// Precision policy:
// - Default: tensor core algorithms enabled via cublasLtMatmul heuristics.
// For double, cuBLAS may use Ozaki emulation on sm_80+ tensor cores.
// - EIGEN_CUDA_TF32: opt-in to TF32 for float (~2x faster, 10-bit mantissa).
// - EIGEN_NO_CUDA_TENSOR_OPS: disables all tensor core usage. Uses pedantic
// compute types. For bit-exact reproducibility.
template <typename Scalar>
struct cuda_compute_type;
template <>
struct cuda_compute_type<float> {
#if defined(EIGEN_NO_CUDA_TENSOR_OPS)
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_PEDANTIC;
#elif defined(EIGEN_CUDA_TF32)
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_FAST_TF32;
#else
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
#endif
};
template <>
struct cuda_compute_type<double> {
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F_PEDANTIC;
#else
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
#endif
};
template <>
struct cuda_compute_type<std::complex<float>> {
#if defined(EIGEN_NO_CUDA_TENSOR_OPS)
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_PEDANTIC;
#elif defined(EIGEN_CUDA_TF32)
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_FAST_TF32;
#else
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
#endif
};
template <>
struct cuda_compute_type<std::complex<double>> {
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F_PEDANTIC;
#else
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
#endif
};
// ---- Alpha/beta scalar type for cublasLtMatmul ------------------------------
// For standard types, alpha/beta match the scalar type.
template <typename Scalar>
struct cuda_gemm_scalar {
using type = Scalar;
};
// ---- cublasLt GEMM dispatch -------------------------------------------------
// Wraps cublasLtMatmul with descriptor setup, heuristic algorithm selection,
// and lazy workspace management. Supports 64-bit dimensions natively.
//
// The workspace buffer (DeviceBuffer*) is grown lazily to match the selected
// algorithm's actual requirement. The heuristic is queried with a generous
// 32 MB cap so that the best algorithm is never excluded. Growth is monotonic:
// the buffer only grows, never shrinks, so reallocation happens at most a few
// times during the lifetime of the owning GpuContext or solver.
//
// EIGEN_NO_CUDA_TENSOR_OPS: pedantic compute types (CUBLAS_COMPUTE_32F_PEDANTIC,
// CUBLAS_COMPUTE_64F_PEDANTIC) prevent cublasLt from selecting tensor core
// algorithms, matching the previous cublasGemmEx behavior.
//
// Thread safety: the workspace buffer is not thread-safe. All GEMM calls
// sharing a workspace must be on the same CUDA stream (guaranteed by GpuContext's
// single-stream design and by each GpuSVD owning its own stream).
//
// Future optimization: for hot loops (e.g., CG iteration), caching descriptors
// and the selected algorithm by (m, n, k, dtype, transA, transB) would avoid
// per-call descriptor creation and heuristic lookup overhead.
#define EIGEN_CUBLASLT_CHECK(expr) \
do { \
cublasStatus_t _s = (expr); \
eigen_assert(_s == CUBLAS_STATUS_SUCCESS && "cuBLASLt call failed"); \
} while (0)
// Maximum workspace the heuristic is allowed to consider. This is a preference
// ceiling, not an allocation — actual allocation matches the selected algorithm.
static constexpr size_t kCublasLtMaxWorkspaceBytes = 32 * 1024 * 1024; // 32 MB
// cublasGemmEx fallback algorithm hint (used when cublasLt heuristic returns no results).
constexpr cublasGemmAlgo_t cuda_gemm_algo() {
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
return CUBLAS_GEMM_DEFAULT;
#else
return CUBLAS_GEMM_DEFAULT_TENSOR_OP;
#endif
}
template <typename Scalar>
void cublaslt_gemm(cublasLtHandle_t lt_handle, cublasHandle_t cublas_handle, cublasOperation_t transA,
cublasOperation_t transB, int64_t m, int64_t n, int64_t k,
const typename cuda_gemm_scalar<Scalar>::type* alpha, const Scalar* A, int64_t lda, const Scalar* B,
int64_t ldb, const typename cuda_gemm_scalar<Scalar>::type* beta, Scalar* C, int64_t ldc,
DeviceBuffer* workspace, cudaStream_t stream) {
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
using AlphaType = typename cuda_gemm_scalar<Scalar>::type;
constexpr cudaDataType_t alpha_type = cuda_data_type<AlphaType>::value;
// Matmul descriptor.
cublasLtMatmulDesc_t matmul_desc = nullptr;
EIGEN_CUBLASLT_CHECK(cublasLtMatmulDescCreate(&matmul_desc, compute, alpha_type));
EIGEN_CUBLASLT_CHECK(
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transA, sizeof(transA)));
EIGEN_CUBLASLT_CHECK(
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, sizeof(transB)));
// Matrix layout descriptors (column-major).
// Physical layout dimensions: rows × cols with leading dimension lda/ldb/ldc.
const int64_t a_rows = (transA == CUBLAS_OP_N) ? m : k;
const int64_t b_rows = (transB == CUBLAS_OP_N) ? k : n;
cublasLtMatrixLayout_t layout_A = nullptr, layout_B = nullptr, layout_C = nullptr;
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_A, dtype, a_rows, (transA == CUBLAS_OP_N) ? k : m, lda));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_B, dtype, b_rows, (transB == CUBLAS_OP_N) ? n : k, ldb));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_C, dtype, m, n, ldc));
// Heuristic selection: query with generous workspace cap, allocate only what's needed.
cublasLtMatmulPreference_t preference = nullptr;
EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceCreate(&preference));
size_t max_ws = kCublasLtMaxWorkspaceBytes;
EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&max_ws, sizeof(max_ws)));
cublasLtMatmulHeuristicResult_t result;
int returned_results = 0;
cublasStatus_t heuristic_status = cublasLtMatmulAlgoGetHeuristic(lt_handle, matmul_desc, layout_A, layout_B, layout_C,
layout_C, preference, 1, &result, &returned_results);
if (heuristic_status == CUBLAS_STATUS_SUCCESS && returned_results > 0) {
// cublasLt path: use the selected algorithm with lazy workspace.
const size_t needed = result.workspaceSize;
if (needed > workspace->size()) {
// Sync only when freeing an existing buffer that may be in use.
if (workspace->ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
*workspace = DeviceBuffer(needed);
}
EIGEN_CUBLASLT_CHECK(cublasLtMatmul(lt_handle, matmul_desc, alpha, A, layout_A, B, layout_B, beta, C, layout_C, C,
layout_C, &result.algo, workspace->ptr, needed, stream));
} else {
// Fallback: cublasGemmEx for shapes/types that cublasLt cannot handle.
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_handle, transA, transB, static_cast<int>(m), static_cast<int>(n),
static_cast<int>(k), alpha, A, dtype, static_cast<int>(lda), B, dtype,
static_cast<int>(ldb), beta, C, dtype, static_cast<int>(ldc), compute,
cuda_gemm_algo()));
}
// Cleanup descriptors.
EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceDestroy(preference));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_C));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_B));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_A));
EIGEN_CUBLASLT_CHECK(cublasLtMatmulDescDestroy(matmul_desc));
}
// ---- Type-specific cuBLAS wrappers ------------------------------------------
// cuBLAS uses separate functions per type (Strsm, Dtrsm, etc.).
@@ -154,6 +298,129 @@ inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cubla
reinterpret_cast<cuDoubleComplex*>(C), ldc);
}
// GEAM wrappers: C = alpha * op(A) + beta * op(B)
// Covers transpose, scale, matrix add/subtract in one call.
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
const float* alpha, const float* A, int lda, const float* beta, const float* B,
int ldb, float* C, int ldc) {
return cublasSgeam(h, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
}
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
const double* alpha, const double* A, int lda, const double* beta, const double* B,
int ldb, double* C, int ldc) {
return cublasDgeam(h, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
}
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
const std::complex<float>* alpha, const std::complex<float>* A, int lda,
const std::complex<float>* beta, const std::complex<float>* B, int ldb,
std::complex<float>* C, int ldc) {
return cublasCgeam(h, transa, transb, m, n, reinterpret_cast<const cuComplex*>(alpha),
reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<const cuComplex*>(beta),
reinterpret_cast<const cuComplex*>(B), ldb, reinterpret_cast<cuComplex*>(C), ldc);
}
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
const std::complex<double>* alpha, const std::complex<double>* A, int lda,
const std::complex<double>* beta, const std::complex<double>* B, int ldb,
std::complex<double>* C, int ldc) {
return cublasZgeam(h, transa, transb, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<const cuDoubleComplex*>(beta),
reinterpret_cast<const cuDoubleComplex*>(B), ldb, reinterpret_cast<cuDoubleComplex*>(C), ldc);
}
// ---- cuBLAS Level-1 wrappers ------------------------------------------------
// Type-dispatched wrappers for BLAS-1 vector operations: dot, axpy, nrm2, scal, copy.
// These work with CUBLAS_POINTER_MODE_HOST or CUBLAS_POINTER_MODE_DEVICE depending
// on the caller's configuration. For device pointer mode, scalar result pointers
// (dot, nrm2) must point to device memory.
// dot: result = x^T * y (real) or x^H * y (complex conjugate dot)
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const float* x, int incx, const float* y, int incy,
float* result) {
return cublasSdot(h, n, x, incx, y, incy, result);
}
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const double* x, int incx, const double* y, int incy,
double* result) {
return cublasDdot(h, n, x, incx, y, incy, result);
}
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const std::complex<float>* x, int incx,
const std::complex<float>* y, int incy, std::complex<float>* result) {
return cublasCdotc(h, n, reinterpret_cast<const cuComplex*>(x), incx, reinterpret_cast<const cuComplex*>(y), incy,
reinterpret_cast<cuComplex*>(result));
}
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const std::complex<double>* x, int incx,
const std::complex<double>* y, int incy, std::complex<double>* result) {
return cublasZdotc(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx,
reinterpret_cast<const cuDoubleComplex*>(y), incy, reinterpret_cast<cuDoubleComplex*>(result));
}
// nrm2: result = ||x||_2 (always returns real)
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const float* x, int incx, float* result) {
return cublasSnrm2(h, n, x, incx, result);
}
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const double* x, int incx, double* result) {
return cublasDnrm2(h, n, x, incx, result);
}
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const std::complex<float>* x, int incx, float* result) {
return cublasScnrm2(h, n, reinterpret_cast<const cuComplex*>(x), incx, result);
}
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const std::complex<double>* x, int incx, double* result) {
return cublasDznrm2(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx, result);
}
// axpy: y += alpha * x
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const float* alpha, const float* x, int incx, float* y,
int incy) {
return cublasSaxpy(h, n, alpha, x, incx, y, incy);
}
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const double* alpha, const double* x, int incx, double* y,
int incy) {
return cublasDaxpy(h, n, alpha, x, incx, y, incy);
}
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const std::complex<float>* alpha,
const std::complex<float>* x, int incx, std::complex<float>* y, int incy) {
return cublasCaxpy(h, n, reinterpret_cast<const cuComplex*>(alpha), reinterpret_cast<const cuComplex*>(x), incx,
reinterpret_cast<cuComplex*>(y), incy);
}
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const std::complex<double>* alpha,
const std::complex<double>* x, int incx, std::complex<double>* y, int incy) {
return cublasZaxpy(h, n, reinterpret_cast<const cuDoubleComplex*>(alpha), reinterpret_cast<const cuDoubleComplex*>(x),
incx, reinterpret_cast<cuDoubleComplex*>(y), incy);
}
// scal: x *= alpha
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const float* alpha, float* x, int incx) {
return cublasSscal(h, n, alpha, x, incx);
}
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const double* alpha, double* x, int incx) {
return cublasDscal(h, n, alpha, x, incx);
}
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const std::complex<float>* alpha, std::complex<float>* x,
int incx) {
return cublasCscal(h, n, reinterpret_cast<const cuComplex*>(alpha), reinterpret_cast<cuComplex*>(x), incx);
}
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const std::complex<double>* alpha, std::complex<double>* x,
int incx) {
return cublasZscal(h, n, reinterpret_cast<const cuDoubleComplex*>(alpha), reinterpret_cast<cuDoubleComplex*>(x),
incx);
}
// copy: y = x
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const float* x, int incx, float* y, int incy) {
return cublasScopy(h, n, x, incx, y, incy);
}
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const double* x, int incx, double* y, int incy) {
return cublasDcopy(h, n, x, incx, y, incy);
}
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const std::complex<float>* x, int incx,
std::complex<float>* y, int incy) {
return cublasCcopy(h, n, reinterpret_cast<const cuComplex*>(x), incx, reinterpret_cast<cuComplex*>(y), incy);
}
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const std::complex<double>* x, int incx,
std::complex<double>* y, int incy) {
return cublasZcopy(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx, reinterpret_cast<cuDoubleComplex*>(y),
incy);
}
} // namespace internal
} // namespace Eigen
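The lazy workspace policy documented in this file, distilled into a standalone sketch. DeviceBuffer's ptr/size()/move-assignment and EIGEN_CUDA_RUNTIME_CHECK are used exactly as in the cublaslt_gemm body above; the helper name itself is hypothetical:

// Grow-only reallocation: sync first because the old buffer may still be
// referenced by work previously enqueued on this stream.
void grow_workspace(Eigen::internal::DeviceBuffer* ws, size_t needed, cudaStream_t stream) {
  if (needed <= ws->size()) return;  // monotonic: the buffer never shrinks
  if (ws->ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
  *ws = Eigen::internal::DeviceBuffer(needed);  // match the selected algorithm's requirement
}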

View File

@@ -0,0 +1,134 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// cuDSS support utilities: error checking macro, type mapping.
//
// cuDSS is NVIDIA's sparse direct solver library, supporting Cholesky (LL^T),
// LDL^T, and LU factorization on GPU. It requires CUDA 12.0+ and is
// distributed separately from the CUDA Toolkit.
#ifndef EIGEN_GPU_CUDSS_SUPPORT_H
#define EIGEN_GPU_CUDSS_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include <cudss.h>
namespace Eigen {
namespace internal {
// ---- Error checking ---------------------------------------------------------
#define EIGEN_CUDSS_CHECK(x) \
do { \
cudssStatus_t _s = (x); \
eigen_assert(_s == CUDSS_STATUS_SUCCESS && "cuDSS call failed: " #x); \
EIGEN_UNUSED_VARIABLE(_s); \
} while (0)
// ---- Scalar → cudssMatrixType_t for SPD/HPD ---------------------------------
template <typename Scalar>
struct cudss_spd_type;
template <>
struct cudss_spd_type<float> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
};
template <>
struct cudss_spd_type<double> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
};
template <>
struct cudss_spd_type<std::complex<float>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
};
template <>
struct cudss_spd_type<std::complex<double>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
};
// ---- Scalar → cudssMatrixType_t for symmetric/Hermitian ---------------------
template <typename Scalar>
struct cudss_symmetric_type;
template <>
struct cudss_symmetric_type<float> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
};
template <>
struct cudss_symmetric_type<double> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
};
template <>
struct cudss_symmetric_type<std::complex<float>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
};
template <>
struct cudss_symmetric_type<std::complex<double>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
};
// ---- StorageIndex → cudaDataType_t ------------------------------------------
template <typename StorageIndex>
struct cudss_index_type;
template <>
struct cudss_index_type<int> {
static constexpr cudaDataType_t value = CUDA_R_32I;
};
template <>
struct cudss_index_type<int64_t> {
static constexpr cudaDataType_t value = CUDA_R_64I;
};
// ---- UpLo → cudssMatrixViewType_t -------------------------------------------
// For symmetric matrices stored as CSC (ColMajor), cuDSS sees CSR of A^T.
// Since A = A^T, the data is the same, but the triangle view must be swapped.
template <int UpLo, int StorageOrder>
struct cudss_view_type;
// ColMajor (CSC) passed as CSR: lower ↔ upper swap.
template <>
struct cudss_view_type<Lower, ColMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
};
template <>
struct cudss_view_type<Upper, ColMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
};
// RowMajor (CSR) passed directly: no swap needed.
template <>
struct cudss_view_type<Lower, RowMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
};
template <>
struct cudss_view_type<Upper, RowMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
};
} // namespace internal
// ---- Ordering enum ----------------------------------------------------------
enum class GpuSparseOrdering {
AMD, // Default fill-reducing ordering
METIS, // METIS nested dissection
RCM // Reverse Cuthill-McKee
};
} // namespace Eigen
#endif // EIGEN_GPU_CUDSS_SUPPORT_H
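A compile-time illustration of the triangle swap documented above; these assertions follow directly from the cudss_view_type specializations in this file:

// A CSC lower triangle handed to cuDSS (which reads CSR) is the upper triangle of A^T:
static_assert(Eigen::internal::cudss_view_type<Eigen::Lower, Eigen::ColMajor>::value == CUDSS_MVIEW_UPPER,
              "CSC lower view must swap to upper");
static_assert(Eigen::internal::cudss_view_type<Eigen::Lower, Eigen::RowMajor>::value == CUDSS_MVIEW_LOWER,
              "CSR lower view passes through unchanged");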

View File

@@ -0,0 +1,103 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// cuFFT support utilities: error checking macro, type mapping.
#ifndef EIGEN_GPU_CUFFT_SUPPORT_H
#define EIGEN_GPU_CUFFT_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include <cufft.h>
namespace Eigen {
namespace internal {
// ---- Error checking ---------------------------------------------------------
#define EIGEN_CUFFT_CHECK(x) \
do { \
cufftResult _r = (x); \
eigen_assert(_r == CUFFT_SUCCESS && "cuFFT call failed: " #x); \
EIGEN_UNUSED_VARIABLE(_r); \
} while (0)
// ---- Scalar → cufftType traits ----------------------------------------------
template <typename Scalar>
struct cufft_c2c_type;
template <>
struct cufft_c2c_type<float> {
static constexpr cufftType value = CUFFT_C2C;
};
template <>
struct cufft_c2c_type<double> {
static constexpr cufftType value = CUFFT_Z2Z;
};
template <typename Scalar>
struct cufft_r2c_type;
template <>
struct cufft_r2c_type<float> {
static constexpr cufftType value = CUFFT_R2C;
};
template <>
struct cufft_r2c_type<double> {
static constexpr cufftType value = CUFFT_D2Z;
};
template <typename Scalar>
struct cufft_c2r_type;
template <>
struct cufft_c2r_type<float> {
static constexpr cufftType value = CUFFT_C2R;
};
template <>
struct cufft_c2r_type<double> {
static constexpr cufftType value = CUFFT_Z2D;
};
// ---- Type-dispatched cuFFT execution ----------------------------------------
// C2C
inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<float>* in, std::complex<float>* out,
int direction) {
return cufftExecC2C(plan, reinterpret_cast<cufftComplex*>(in), reinterpret_cast<cufftComplex*>(out), direction);
}
inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<double>* in, std::complex<double>* out,
int direction) {
return cufftExecZ2Z(plan, reinterpret_cast<cufftDoubleComplex*>(in), reinterpret_cast<cufftDoubleComplex*>(out),
direction);
}
// R2C
inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, float* in, std::complex<float>* out) {
return cufftExecR2C(plan, in, reinterpret_cast<cufftComplex*>(out));
}
inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, double* in, std::complex<double>* out) {
return cufftExecD2Z(plan, in, reinterpret_cast<cufftDoubleComplex*>(out));
}
// C2R
inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<float>* in, float* out) {
return cufftExecC2R(plan, reinterpret_cast<cufftComplex*>(in), out);
}
inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<double>* in, double* out) {
return cufftExecZ2D(plan, reinterpret_cast<cufftDoubleComplex*>(in), out);
}
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_CUFFT_SUPPORT_H
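A short sketch of how the dispatch helpers above are meant to be used. Plan creation via cufftPlan1d is standard cuFFT, not part of this diff; the wrapper function is illustrative:

// One overload set serves both precisions: the trait picks CUFFT_C2C vs CUFFT_Z2Z
// and the _dispatch overload picks cufftExecC2C vs cufftExecZ2Z.
inline void fft_forward_inplace(std::complex<float>* d_data, int n) {  // d_data: device memory
  cufftHandle plan;
  EIGEN_CUFFT_CHECK(cufftPlan1d(&plan, n, Eigen::internal::cufft_c2c_type<float>::value, /*batch=*/1));
  EIGEN_CUFFT_CHECK(Eigen::internal::cufftExecC2C_dispatch(plan, d_data, d_data, CUFFT_FORWARD));
  EIGEN_CUFFT_CHECK(cufftDestroy(plan));
}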

View File

@@ -91,6 +91,68 @@ struct cusolver_fill_mode<Upper, RowMajor> {
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
};
// ---- Type-specific cuSOLVER wrappers ----------------------------------------
// cuSOLVER does not provide generic X variants for ormqr/unmqr. These overloaded
// wrappers dispatch to the correct type-specific function.
// For real types: ormqr (orthogonal Q). For complex types: unmqr (unitary Q).
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
int n, int k, const float* A, int lda, const float* tau, float* C, int ldc,
float* work, int lwork, int* info) {
return cusolverDnSormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
}
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
int n, int k, const double* A, int lda, const double* tau, double* C, int ldc,
double* work, int lwork, int* info) {
return cusolverDnDormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
}
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
int n, int k, const std::complex<float>* A, int lda,
const std::complex<float>* tau, std::complex<float>* C, int ldc,
std::complex<float>* work, int lwork, int* info) {
return cusolverDnCunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<cuComplex*>(C), ldc,
reinterpret_cast<cuComplex*>(work), lwork, info);
}
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
int n, int k, const std::complex<double>* A, int lda,
const std::complex<double>* tau, std::complex<double>* C, int ldc,
std::complex<double>* work, int lwork, int* info) {
return cusolverDnZunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
reinterpret_cast<const cuDoubleComplex*>(tau), reinterpret_cast<cuDoubleComplex*>(C), ldc,
reinterpret_cast<cuDoubleComplex*>(work), lwork, info);
}
// Buffer size wrappers for ormqr/unmqr.
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
cublasOperation_t trans, int m, int n, int k, const float* A,
int lda, const float* tau, const float* C, int ldc, int* lwork) {
return cusolverDnSormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
}
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
cublasOperation_t trans, int m, int n, int k, const double* A,
int lda, const double* tau, const double* C, int ldc, int* lwork) {
return cusolverDnDormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
}
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
cublasOperation_t trans, int m, int n, int k,
const std::complex<float>* A, int lda,
const std::complex<float>* tau, const std::complex<float>* C,
int ldc, int* lwork) {
return cusolverDnCunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<const cuComplex*>(C),
ldc, lwork);
}
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
cublasOperation_t trans, int m, int n, int k,
const std::complex<double>* A, int lda,
const std::complex<double>* tau, const std::complex<double>* C,
int ldc, int* lwork) {
return cusolverDnZunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
reinterpret_cast<const cuDoubleComplex*>(tau),
reinterpret_cast<const cuDoubleComplex*>(C), ldc, lwork);
}
} // namespace internal
} // namespace Eigen

View File

@@ -0,0 +1,34 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// cuSPARSE support utilities: error checking macro.
#ifndef EIGEN_GPU_CUSPARSE_SUPPORT_H
#define EIGEN_GPU_CUSPARSE_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include <cusparse.h>
namespace Eigen {
namespace internal {
#define EIGEN_CUSPARSE_CHECK(x) \
do { \
cusparseStatus_t _s = (x); \
eigen_assert(_s == CUSPARSE_STATUS_SUCCESS && "cuSPARSE call failed: " #x); \
EIGEN_UNUSED_VARIABLE(_s); \
} while (0)
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_CUSPARSE_SUPPORT_H

View File

@@ -29,10 +29,11 @@ namespace Eigen {
namespace internal {
// ---- GEMM dispatch ----------------------------------------------------------
-// GemmExpr<Lhs, Rhs> → cublasGemmEx(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
+// GemmExpr<Lhs, Rhs> → cublasLtMatmul via GpuContext.
//
-// The generic API cublasGemmEx handles all scalar types (float, double,
-// complex<float>, complex<double>) via cudaDataType_t.
+// Uses cublasLtMatmul for 64-bit dimension support and heuristic algorithm
+// selection. All scalar types (float, double, complex<float>, complex<double>)
+// are handled via cudaDataType_t.
template <typename Lhs, typename Rhs>
void dispatch_gemm(
@@ -46,6 +47,10 @@ void dispatch_gemm(
const DeviceMatrix<Scalar>& A = traits_lhs::matrix(expr.lhs());
const DeviceMatrix<Scalar>& B = traits_rhs::matrix(expr.rhs());
// cuBLAS GEMM: C must not alias A or B (undefined behavior).
eigen_assert(dst.data() != A.data() && "GEMM: output aliases left operand (use a temporary)");
eigen_assert(dst.data() != B.data() && "GEMM: output aliases right operand (use a temporary)");
constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op);
constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op);
@@ -58,8 +63,8 @@ void dispatch_gemm(
eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch");
-const int64_t lda = A.outerStride();
-const int64_t ldb = B.outerStride();
+const int64_t lda = A.rows();
+const int64_t ldb = B.rows();
// Serialize all accesses to the destination buffer on this stream.
if (!dst.empty()) {
@@ -71,9 +76,13 @@ void dispatch_gemm(
if (resized) {
dst.resize(m, n);
}
-const int64_t ldc = dst.outerStride();
+const int64_t ldc = dst.rows();
-Scalar alpha_val = alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs());
+// cuBLAS requires alpha/beta as float for half/bfloat16 inputs.
+using GemmScalar = typename cuda_gemm_scalar<Scalar>::type;
+GemmScalar alpha_gval =
+static_cast<GemmScalar>(alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs()));
+GemmScalar beta_gval = static_cast<GemmScalar>(beta_val);
// Wait for operands to be ready on this stream.
A.waitReady(ctx.stream());
@@ -81,17 +90,12 @@ void dispatch_gemm(
// If there is no existing valid destination to accumulate into, treat it as
// zero rather than reading uninitialized memory.
-if (resized && beta_val != Scalar(0) && dst.sizeInBytes() > 0) {
+if (resized && beta_gval != GemmScalar(0) && dst.sizeInBytes() > 0) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
}
-constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
-constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
-EIGEN_CUBLAS_CHECK(cublasGemmEx(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n),
-static_cast<int>(k), &alpha_val, A.data(), dtype, static_cast<int>(lda), B.data(),
-dtype, static_cast<int>(ldb), &beta_val, dst.data(), dtype, static_cast<int>(ldc),
-compute, CUBLAS_GEMM_DEFAULT));
+cublaslt_gemm<Scalar>(ctx.cublasLtHandle(), ctx.cublasHandle(), transA, transB, m, n, k, &alpha_gval, A.data(), lda,
+B.data(), ldb, &beta_gval, dst.data(), ldc, ctx.gemmWorkspace(), ctx.stream());
dst.recordReady(ctx.stream());
}
@@ -125,9 +129,9 @@ void dispatch_llt_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LltSol
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo, ColMajor>::value;
-const int64_t lda = static_cast<int64_t>(A.outerStride());
-const int64_t ldb = static_cast<int64_t>(B.outerStride());
+const int64_t lda = static_cast<int64_t>(A.rows());
+const int64_t ldb = static_cast<int64_t>(B.rows());
+eigen_assert(ldb == static_cast<int64_t>(B.rows()) && "DeviceMatrix must be densely packed");
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
@@ -163,7 +167,7 @@ void dispatch_llt_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LltSol
// Solve.
DeviceBuffer d_solve_info(sizeof(int));
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), nrhs, dtype,
-d_factor.ptr, lda, dtype, dst.data(), static_cast<int64_t>(dst.outerStride()),
+d_factor.ptr, lda, dtype, dst.data(), static_cast<int64_t>(dst.rows()),
static_cast<int*>(d_solve_info.ptr)));
// Sync to ensure workspace locals can be freed safely.
@@ -201,9 +205,9 @@ void dispatch_lu_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LuSolve
if (!dst.empty()) dst.waitReady(ctx.stream());
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
-const int64_t lda = static_cast<int64_t>(A.outerStride());
-const int64_t ldb = static_cast<int64_t>(B.outerStride());
+const int64_t lda = static_cast<int64_t>(A.rows());
+const int64_t ldb = static_cast<int64_t>(B.rows());
+eigen_assert(ldb == static_cast<int64_t>(B.rows()) && "DeviceMatrix must be densely packed");
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t);
@@ -245,7 +249,7 @@ void dispatch_lu_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LuSolve
DeviceBuffer d_solve_info(sizeof(int));
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, static_cast<int64_t>(n), nrhs,
dtype, d_lu.ptr, lda, static_cast<const int64_t*>(d_ipiv.ptr), dtype,
-dst.data(), static_cast<int64_t>(dst.outerStride()),
+dst.data(), static_cast<int64_t>(dst.rows()),
static_cast<int*>(d_solve_info.ptr)));
// Sync to ensure workspace locals can be freed safely.
@@ -285,15 +289,15 @@ void dispatch_trsm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const TrsmExpr<Sc
// D2D copy B → dst (trsm is in-place on the RHS).
dst.resize(n, B.cols());
-const size_t rhs_bytes = static_cast<size_t>(dst.outerStride()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+const size_t rhs_bytes = static_cast<size_t>(dst.rows()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
Scalar alpha(1);
EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs,
-&alpha, A.data(), static_cast<int>(A.outerStride()), dst.data(),
-static_cast<int>(dst.outerStride())));
+&alpha, A.data(), static_cast<int>(A.rows()), dst.data(),
+static_cast<int>(dst.rows())));
dst.recordReady(ctx.stream());
}
@@ -329,8 +333,8 @@ void dispatch_symm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SymmExpr<Sc
Scalar alpha(1), beta(0);
EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &alpha, A.data(),
-static_cast<int>(A.outerStride()), B.data(), static_cast<int>(B.outerStride()), &beta,
-dst.data(), static_cast<int>(dst.outerStride())));
+static_cast<int>(A.rows()), B.data(), static_cast<int>(B.rows()), &beta, dst.data(),
+static_cast<int>(dst.rows())));
dst.recordReady(ctx.stream());
}
@@ -367,8 +371,8 @@ void dispatch_syrk(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SyrkExpr<Sc
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(),
-static_cast<int>(A.outerStride()), &beta_val, dst.data(),
-static_cast<int>(dst.outerStride())));
+static_cast<int>(A.rows()), &beta_val, dst.data(), static_cast<int>(dst.rows())));
dst.recordReady(ctx.stream());
}
@@ -501,6 +504,284 @@ void DeviceSelfAdjointView<Scalar_, UpLo_>::rankUpdate(const DeviceMatrix<Scalar
internal::dispatch_syrk(GpuContext::threadLocal(), matrix(), expr, alpha, beta);
}
// ---- DeviceMatrix BLAS-1 out-of-line definitions ----------------------------
// Defined here because they need the full GpuContext definition.
// All methods take an explicit GpuContext& so callers can ensure same-stream
// execution (zero event overhead when all operations share one context).
//
// Reduction methods (dot, norm, squaredNorm) use CUBLAS_POINTER_MODE_HOST:
// the scalar result is written to host memory and cuBLAS synchronizes
// internally before returning. This is necessary for Eigen template
// compatibility — CG does `Scalar alpha = absNew / p.dot(tmp)` which
// requires the host value immediately. A future GPU CG implementation
// that controls the iteration loop can use CUBLAS_POINTER_MODE_DEVICE
// to batch multiple reductions into a single sync point.
template <typename Scalar_>
DeviceScalar<typename DeviceMatrix<Scalar_>::Scalar> DeviceMatrix<Scalar_>::dot(GpuContext& ctx,
const DeviceMatrix& other) const {
const int n = static_cast<int>(rows_ * cols_);
eigen_assert(n == static_cast<int>(other.rows_ * other.cols_));
DeviceScalar<Scalar> result(Scalar(0), ctx.stream());
if (n > 0) {
waitReady(ctx.stream());
other.waitReady(ctx.stream());
cublasPointerMode_t prev;
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
EIGEN_CUBLAS_CHECK(internal::cublasXdot(ctx.cublasHandle(), n, data_, 1, other.data_, 1, result.devicePtr()));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
}
return result;
}
namespace internal {
// Real: dot(x,x) returns DeviceScalar<Scalar> which IS DeviceScalar<RealScalar>.
// Move-construct without any sync.
template <typename Scalar, typename RealScalar>
typename std::enable_if<std::is_same<Scalar, RealScalar>::value, DeviceScalar<RealScalar>>::type squaredNorm_from_dot(
DeviceScalar<Scalar>&& d, cudaStream_t) {
return std::move(d);
}
// Complex: must sync to extract the real part (DeviceScalar arithmetic is real-only).
template <typename Scalar, typename RealScalar>
typename std::enable_if<!std::is_same<Scalar, RealScalar>::value, DeviceScalar<RealScalar>>::type squaredNorm_from_dot(
DeviceScalar<Scalar>&& d, cudaStream_t stream) {
return DeviceScalar<RealScalar>(numext::real(Scalar(d)), stream);
}
} // namespace internal
template <typename Scalar_>
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::squaredNorm(GpuContext& ctx) const {
// Use dot(x,x) instead of nrm2()^2: dot kernel is ~4.5x faster than nrm2
// (nrm2 uses a numerically careful scaled-sum-of-squares algorithm that is
// unnecessary for CG convergence checks).
using RealScalar = typename NumTraits<Scalar_>::Real;
return internal::squaredNorm_from_dot<Scalar_, RealScalar>(dot(ctx, *this), ctx.stream());
}
template <typename Scalar_>
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::norm(GpuContext& ctx) const {
using RealScalar = typename NumTraits<Scalar>::Real;
const int n = static_cast<int>(rows_ * cols_);
DeviceScalar<RealScalar> result(RealScalar(0), ctx.stream());
if (n > 0) {
waitReady(ctx.stream());
cublasPointerMode_t prev;
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
EIGEN_CUBLAS_CHECK(internal::cublasXnrm2(ctx.cublasHandle(), n, data_, 1, result.devicePtr()));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
}
return result;
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::setZero(GpuContext& ctx) {
if (sizeInBytes() > 0) {
waitReady(ctx.stream());
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(data_, 0, sizeInBytes(), ctx.stream()));
recordReady(ctx.stream());
}
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::addScaled(GpuContext& ctx, Scalar alpha, const DeviceMatrix& x) {
const int n = static_cast<int>(rows_ * cols_);
eigen_assert(n == static_cast<int>(x.rows_ * x.cols_));
if (n > 0) {
waitReady(ctx.stream());
x.waitReady(ctx.stream());
EIGEN_CUBLAS_CHECK(internal::cublasXaxpy(ctx.cublasHandle(), n, &alpha, x.data_, 1, data_, 1));
recordReady(ctx.stream());
}
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::scale(GpuContext& ctx, Scalar alpha) {
const int n = static_cast<int>(rows_ * cols_);
if (n > 0) {
waitReady(ctx.stream());
EIGEN_CUBLAS_CHECK(internal::cublasXscal(ctx.cublasHandle(), n, &alpha, data_, 1));
recordReady(ctx.stream());
}
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::copyFrom(GpuContext& ctx, const DeviceMatrix& other) {
// Wait on *this before resize — resize may free the old buffer while another
// stream is still reading it.
if (!empty()) waitReady(ctx.stream());
resize(other.rows_, other.cols_);
const int n = static_cast<int>(rows_ * cols_);
if (n > 0) {
other.waitReady(ctx.stream());
EIGEN_CUBLAS_CHECK(internal::cublasXcopy(ctx.cublasHandle(), n, other.data_, 1, data_, 1));
recordReady(ctx.stream());
}
}
// ---- BLAS-1 operator overloads for CG compatibility -------------------------
// this += alpha * x (axpy)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceScaled<DeviceMatrix>& expr) {
addScaled(GpuContext::threadLocal(), expr.scalar(), internal::device_expr_traits<DeviceMatrix>::matrix(expr.inner()));
return *this;
}
// this -= alpha * x (axpy with negated alpha)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceScaled<DeviceMatrix>& expr) {
addScaled(GpuContext::threadLocal(), -expr.scalar(),
internal::device_expr_traits<DeviceMatrix>::matrix(expr.inner()));
return *this;
}
// this += x (axpy with alpha=1)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceMatrix& other) {
Scalar one(1);
addScaled(GpuContext::threadLocal(), one, other);
return *this;
}
// this -= x (axpy with alpha=-1)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceMatrix& other) {
Scalar neg_one(-1);
addScaled(GpuContext::threadLocal(), neg_one, other);
return *this;
}
// this *= alpha (scal, host pointer)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator*=(Scalar alpha) {
scale(GpuContext::threadLocal(), alpha);
return *this;
}
// this *= alpha (scal, device pointer — avoids host sync)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator*=(const DeviceScalar<Scalar>& alpha) {
const int n = static_cast<int>(rows_ * cols_);
if (n > 0) {
auto& ctx = GpuContext::threadLocal();
waitReady(ctx.stream());
cublasPointerMode_t prev;
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
EIGEN_CUBLAS_CHECK(internal::cublasXscal(ctx.cublasHandle(), n, alpha.devicePtr(), data_, 1));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
recordReady(ctx.stream());
}
return *this;
}
// this += DeviceScalar * x (axpy with CUBLAS_POINTER_MODE_DEVICE)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceScaledDevice<Scalar_>& expr) {
const int n = static_cast<int>(rows_ * cols_);
const auto& x = expr.matrix();
eigen_assert(n == static_cast<int>(x.rows_ * x.cols_));
if (n > 0) {
auto& ctx = GpuContext::threadLocal();
waitReady(ctx.stream());
x.waitReady(ctx.stream());
cublasPointerMode_t prev;
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
EIGEN_CUBLAS_CHECK(internal::cublasXaxpy(ctx.cublasHandle(), n, expr.alpha().devicePtr(), x.data_, 1, data_, 1));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
recordReady(ctx.stream());
}
return *this;
}
// this -= DeviceScalar * x (axpy with negated device scalar)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceScaledDevice<Scalar_>& expr) {
auto neg_alpha = -expr.alpha();
DeviceScaledDevice<Scalar_> neg_expr(neg_alpha, expr.matrix());
return operator+=(neg_expr);
}
// this = alpha * A + beta * B (cuBLAS geam)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const DeviceAddExpr<Scalar_>& expr) {
auto& ctx = GpuContext::threadLocal();
const auto& A = expr.A();
const auto& B = expr.B();
eigen_assert(A.rows() == B.rows() && A.cols() == B.cols());
const int m = static_cast<int>(A.rows());
const int n = static_cast<int>(A.cols());
// Wait on *this before resize — resize may free the old buffer while another
// stream is still reading it.
if (!empty()) waitReady(ctx.stream());
resize(A.rows(), A.cols());
if (m > 0 && n > 0) {
A.waitReady(ctx.stream());
B.waitReady(ctx.stream());
Scalar_ alpha = expr.alpha();
Scalar_ beta = expr.beta();
EIGEN_CUBLAS_CHECK(internal::cublasXgeam(ctx.cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, &alpha, A.data(), m,
&beta, B.data(), m, data_, m));
recordReady(ctx.stream());
}
return *this;
}
// cwiseProduct via NPP nppsMul (allocating).
template <typename Scalar_>
DeviceMatrix<Scalar_> DeviceMatrix<Scalar_>::cwiseProduct(GpuContext& ctx, const DeviceMatrix& other) const {
const int n = static_cast<int>(rows_ * cols_);
eigen_assert(n == static_cast<int>(other.rows_ * other.cols_));
DeviceMatrix result(rows_, cols_);
if (n > 0) {
waitReady(ctx.stream());
other.waitReady(ctx.stream());
internal::device_cwiseProduct(data_, other.data_, result.data_, n, ctx.stream());
result.recordReady(ctx.stream());
}
return result;
}
// In-place cwiseProduct: this = a .* b (reuses this buffer, no allocation).
template <typename Scalar_>
void DeviceMatrix<Scalar_>::cwiseProduct(GpuContext& ctx, const DeviceMatrix& a, const DeviceMatrix& b) {
const int n = static_cast<int>(a.rows_ * a.cols_);
eigen_assert(n == static_cast<int>(b.rows_ * b.cols_));
if (!empty()) waitReady(ctx.stream());
resize(a.rows_, a.cols_);
if (n > 0) {
a.waitReady(ctx.stream());
b.waitReady(ctx.stream());
internal::device_cwiseProduct(a.data_, b.data_, data_, n, ctx.stream());
recordReady(ctx.stream());
}
}
// Convenience overloads using thread-local default GpuContext.
template <typename Scalar_>
DeviceScalar<typename DeviceMatrix<Scalar_>::Scalar> DeviceMatrix<Scalar_>::dot(const DeviceMatrix& other) const {
return dot(GpuContext::threadLocal(), other);
}
template <typename Scalar_>
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::squaredNorm() const {
return squaredNorm(GpuContext::threadLocal());
}
template <typename Scalar_>
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::norm() const {
return norm(GpuContext::threadLocal());
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::setZero() {
setZero(GpuContext::threadLocal());
}
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_DISPATCH_H

View File

@@ -219,6 +219,87 @@ DeviceScaled<DeviceTransposeView<S>> operator*(S alpha, const DeviceTransposeVie
return {alpha, m};
}
// ---- DeviceScaledDevice: DeviceScalar * DeviceMatrix → device-pointer axpy ---
// Like DeviceScaled but carries a DeviceScalar (device pointer) instead of
// a host scalar. operator+= dispatches to cuBLAS axpy with POINTER_MODE_DEVICE.
template <typename Scalar_>
class DeviceScaledDevice {
public:
using Scalar = Scalar_;
DeviceScaledDevice(const DeviceScalar<Scalar>& alpha, const DeviceMatrix<Scalar>& mat) : alpha_(alpha), mat_(mat) {}
const DeviceScalar<Scalar>& alpha() const { return alpha_; }
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
private:
const DeviceScalar<Scalar>& alpha_;
const DeviceMatrix<Scalar>& mat_;
};
// DeviceScalar * DeviceMatrix → DeviceScaledDevice
template <typename S>
DeviceScaledDevice<S> operator*(const DeviceScalar<S>& alpha, const DeviceMatrix<S>& m) {
return {alpha, m};
}
// ---- DeviceAddExpr: a + b → cublasXgeam -------------------------------------
// Captures `DeviceMatrix + DeviceScaled<DeviceMatrix>` (and reverse).
// Dispatched to geam: C = alpha * A + beta * B.
//
// Note: These operator+/- overloads are intentionally free functions on
// DeviceMatrix, not Eigen expression templates. DeviceMatrix does not inherit
// from MatrixBase, so there is no ambiguity with Eigen's own operator+/-.
// If DeviceMatrix is ever made an Eigen expression type, these would need to
// be revisited.
template <typename Scalar_>
class DeviceAddExpr {
public:
using Scalar = Scalar_;
DeviceAddExpr(Scalar alpha, const DeviceMatrix<Scalar>& A, Scalar beta, const DeviceMatrix<Scalar>& B)
: alpha_(alpha), A_(A), beta_(beta), B_(B) {}
Scalar alpha() const { return alpha_; }
Scalar beta() const { return beta_; }
const DeviceMatrix<Scalar>& A() const { return A_; }
const DeviceMatrix<Scalar>& B() const { return B_; }
private:
Scalar alpha_;
const DeviceMatrix<Scalar>& A_;
Scalar beta_;
const DeviceMatrix<Scalar>& B_;
};
// DeviceMatrix + DeviceMatrix → DeviceAddExpr (alpha=1, beta=1)
template <typename S>
DeviceAddExpr<S> operator+(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
return {S(1), a, S(1), b};
}
// DeviceMatrix + DeviceScaled<DeviceMatrix> → DeviceAddExpr (alpha=1, beta=scaled)
template <typename S>
DeviceAddExpr<S> operator+(const DeviceMatrix<S>& a, const DeviceScaled<DeviceMatrix<S>>& b) {
return {S(1), a, b.scalar(), b.inner()};
}
// DeviceScaled<DeviceMatrix> + DeviceMatrix → DeviceAddExpr (alpha=scaled, beta=1)
template <typename S>
DeviceAddExpr<S> operator+(const DeviceScaled<DeviceMatrix<S>>& a, const DeviceMatrix<S>& b) {
return {a.scalar(), a.inner(), S(1), b};
}
// DeviceMatrix - DeviceMatrix → DeviceAddExpr (alpha=1, beta=-1)
template <typename S>
DeviceAddExpr<S> operator-(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
return {S(1), a, S(-1), b};
}
// DeviceMatrix - DeviceScaled<DeviceMatrix> → DeviceAddExpr (alpha=1, beta=-scaled)
template <typename S>
DeviceAddExpr<S> operator-(const DeviceMatrix<S>& a, const DeviceScaled<DeviceMatrix<S>>& b) {
return {S(1), a, -b.scalar(), b.inner()};
}
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_EXPR_H
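A sketch of what the expression classes above buy at the call site, assuming scalar*DeviceMatrix builds a DeviceScaled expression as the operator+ overloads require:

// d_C = d_A + 2*d_B lowers to a single cublasDgeam call with alpha=1, beta=2:
void geam_sketch(const Eigen::DeviceMatrix<double>& d_A, const Eigen::DeviceMatrix<double>& d_B,
                 Eigen::DeviceMatrix<double>& d_C) {
  d_C = d_A + 2.0 * d_B;  // DeviceAddExpr → DeviceMatrix::operator= → cublasXgeam
}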

View File

@@ -10,7 +10,7 @@
 // Typed RAII wrapper for a dense matrix in GPU device memory.
 //
 // DeviceMatrix<Scalar> holds a column-major matrix on the GPU with tracked
-// dimensions and leading dimension. It can be passed to GPU solvers
+// dimensions. Always dense (leading dimension = rows). It can be passed to GPU solvers
 // (GpuLLT, GpuLU, future cuBLAS/cuDSS) without host round-trips.
 //
 // Cross-stream safety is automatic: an internal CUDA event tracks when the
@@ -25,7 +25,7 @@
 //   MatrixXd X = d_X.toHost();  // download + block
 //
 // Async variants:
-//   auto d_A = DeviceMatrix<double>::fromHostAsync(A.data(), n, n, n, stream);
+//   auto d_A = DeviceMatrix<double>::fromHostAsync(A.data(), n, n, stream);
 //   auto transfer = d_X.toHostAsync(stream);  // enqueue D2H
 //   // ... overlap with other work ...
 //   MatrixXd X = transfer.get();  // block + retrieve
@@ -53,6 +53,16 @@ template <typename>
 class DeviceAssignment;
 template <typename, typename>
 class GemmExpr;
+template <typename>
+class DeviceScaled;
+template <typename>
+class SpMVExpr;
+template <typename>
+class DeviceAddExpr;
+template <typename>
+class DeviceScaledDevice;
+template <typename>
+class DeviceScalar;
 template <typename, int>
 class LltSolveExpr;
 template <typename>
@@ -157,7 +167,8 @@ class HostTransfer {
  *
  * \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
  *
- * Owns a device allocation with tracked dimensions and leading dimension.
+ * Owns a device allocation with tracked dimensions. Always dense
+ * (leading dimension = rows; no stride padding).
  * An internal CUDA event records when the data was last written, enabling
  * safe cross-stream consumption without user-visible synchronization.
  *
@@ -169,6 +180,8 @@ template <typename Scalar_>
 class DeviceMatrix {
  public:
   using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using PlainObject = DeviceMatrix;  // owning type (for CG template compatibility)
   using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
   // ---- Construction / destruction ------------------------------------------
@@ -176,8 +189,18 @@ class DeviceMatrix {
   /** Default: empty (0x0, no allocation). */
   DeviceMatrix() = default;
+  /** Allocate uninitialized column vector of given size.
+   * Matches Matrix<Scalar,Dynamic,1>(n) for CG template compatibility. */
+  explicit DeviceMatrix(Index n) : rows_(n), cols_(1) {
+    eigen_assert(n >= 0);
+    size_t bytes = sizeInBytes();
+    if (bytes > 0) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
+    }
+  }
   /** Allocate uninitialized device memory for a rows x cols matrix. */
-  DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols), outerStride_(rows) {
+  DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols) {
     eigen_assert(rows >= 0 && cols >= 0);
     size_t bytes = sizeInBytes();
     if (bytes > 0) {
@@ -196,14 +219,12 @@ class DeviceMatrix {
       : data_(o.data_),
         rows_(o.rows_),
         cols_(o.cols_),
-        outerStride_(o.outerStride_),
         ready_event_(o.ready_event_),
         ready_stream_(o.ready_stream_),
         retained_buffer_(std::move(o.retained_buffer_)) {
     o.data_ = nullptr;
     o.rows_ = 0;
     o.cols_ = 0;
-    o.outerStride_ = 0;
     o.ready_event_ = nullptr;
     o.ready_stream_ = nullptr;
   }
@@ -215,14 +236,12 @@ class DeviceMatrix {
     data_ = o.data_;
     rows_ = o.rows_;
     cols_ = o.cols_;
-    outerStride_ = o.outerStride_;
     ready_event_ = o.ready_event_;
     ready_stream_ = o.ready_stream_;
     retained_buffer_ = std::move(o.retained_buffer_);
     o.data_ = nullptr;
     o.rows_ = 0;
     o.cols_ = 0;
-    o.outerStride_ = 0;
     o.ready_event_ = nullptr;
     o.ready_stream_ = nullptr;
   }
@@ -262,26 +281,14 @@ class DeviceMatrix {
    * \param host_data Pointer to contiguous column-major host data.
    * \param rows Number of rows.
    * \param cols Number of columns.
-   * \param outerStride Leading dimension (>= rows). Use rows for dense.
    * \param stream CUDA stream for the transfer.
    */
-  static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, Index outerStride,
-                                    cudaStream_t stream) {
-    eigen_assert(rows >= 0 && cols >= 0 && outerStride >= rows);
+  static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, cudaStream_t stream) {
+    eigen_assert(rows >= 0 && cols >= 0);
     eigen_assert(host_data != nullptr || (rows == 0 || cols == 0));
     DeviceMatrix dm(rows, cols);
     if (dm.sizeInBytes() > 0) {
-      // If outerStride == rows (dense), single contiguous copy.
-      // Otherwise, copy column by column (strided layout).
-      if (outerStride == rows) {
-        EIGEN_CUDA_RUNTIME_CHECK(
-            cudaMemcpyAsync(dm.data_, host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
-      } else {
-        EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(dm.data_, static_cast<size_t>(rows) * sizeof(Scalar), host_data,
-                                                   static_cast<size_t>(outerStride) * sizeof(Scalar),
-                                                   static_cast<size_t>(rows) * sizeof(Scalar),
-                                                   static_cast<size_t>(cols), cudaMemcpyHostToDevice, stream));
-      }
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dm.data_, host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
       dm.recordReady(stream);
     }
     return dm;
@@ -360,7 +367,6 @@ class DeviceMatrix {
     retained_buffer_ = internal::DeviceBuffer();
     rows_ = rows;
     cols_ = cols;
-    outerStride_ = rows;
     size_t bytes = sizeInBytes();
     if (bytes > 0) {
       EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
@@ -373,11 +379,10 @@ class DeviceMatrix {
   const Scalar* data() const { return data_; }
   Index rows() const { return rows_; }
   Index cols() const { return cols_; }
-  Index outerStride() const { return outerStride_; }
   bool empty() const { return rows_ == 0 || cols_ == 0; }
   /** Size of the device allocation in bytes. */
-  size_t sizeInBytes() const { return static_cast<size_t>(outerStride_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
+  size_t sizeInBytes() const { return static_cast<size_t>(rows_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
   // ---- Event synchronization (public for library dispatch interop) ---------
@@ -463,11 +468,108 @@ class DeviceMatrix {
   template <int UpLo>
   DeviceMatrix& operator=(const SymmExpr<Scalar, UpLo>& expr);
+  // ---- BLAS Level-1 operations ----------------------------------------------
+  // DeviceMatrix is always dense (lda == rows), so a vector is simply a
+  // DeviceMatrix with cols == 1. These BLAS-1 methods operate on the flat
+  // rows*cols element array, making them work for both vectors and matrices.
+  //
+  // All methods take an explicit GpuContext& for stream/handle control.
+  // When everything uses the same context, event waits are skipped (same-stream).
+  // Defined out-of-line in DeviceDispatch.h (needs GpuContext).
+  /** Dot product: this^H * other. Returns DeviceScalar — the result stays
+   * on device until read via implicit conversion to Scalar (which syncs).
+   * When used with `auto`, no sync occurs until the value is needed. */
+  DeviceScalar<Scalar> dot(GpuContext& ctx, const DeviceMatrix& other) const;
+  /** Squared L2 norm via dot(x, x). Returns DeviceScalar (no sync until read).
+   * For real types, the result stays on device. For complex types, falls back
+   * to host sync (DeviceScalar arithmetic is real-only). */
+  DeviceScalar<typename NumTraits<Scalar>::Real> squaredNorm(GpuContext& ctx) const;
+  /** L2 norm. Returns DeviceScalar (no host sync). */
+  DeviceScalar<typename NumTraits<Scalar>::Real> norm(GpuContext& ctx) const;
+  /** Set all elements to zero. */
+  void setZero(GpuContext& ctx);
+  /** this += alpha * x (cuBLAS axpy). Requires same total size. */
+  void addScaled(GpuContext& ctx, Scalar alpha, const DeviceMatrix& x);
+  /** this *= alpha (cuBLAS scal). */
+  void scale(GpuContext& ctx, Scalar alpha);
+  /** Deep copy: this = other (cuBLAS copy). Resizes if needed. */
+  void copyFrom(GpuContext& ctx, const DeviceMatrix& other);
+  // Convenience overloads using the thread-local default GpuContext.
+  DeviceScalar<Scalar> dot(const DeviceMatrix& other) const;
+  DeviceScalar<typename NumTraits<Scalar>::Real> squaredNorm() const;
+  DeviceScalar<typename NumTraits<Scalar>::Real> norm() const;
+  void setZero();
+  // ---- BLAS-1 operator overloads for CG/iterative solver compatibility ------
+  // These allow CG code like `x += alpha * p` to work with DeviceMatrix.
+  // `alpha * DeviceMatrix` already returns `DeviceScaled<DeviceMatrix<Scalar>>`
+  // (defined in DeviceExpr.h). These operators dispatch to cuBLAS axpy/scal.
+  // Defined out-of-line in DeviceDispatch.h.
+  /** this += alpha * x (cuBLAS axpy). For `x += alpha * p`. */
+  DeviceMatrix& operator+=(const DeviceScaled<DeviceMatrix>& expr);
+  /** this -= alpha * x (cuBLAS axpy with negated alpha). For `r -= alpha * tmp`. */
+  DeviceMatrix& operator-=(const DeviceScaled<DeviceMatrix>& expr);
+  /** this += x (cuBLAS axpy with alpha=1). */
+  DeviceMatrix& operator+=(const DeviceMatrix& other);
+  /** this -= x (cuBLAS axpy with alpha=-1). */
+  DeviceMatrix& operator-=(const DeviceMatrix& other);
+  /** this *= alpha (cuBLAS scal, host pointer mode). For `p *= beta`. */
+  DeviceMatrix& operator*=(Scalar alpha);
+  /** this *= alpha (cuBLAS scal, device pointer mode). Avoids host sync. */
+  DeviceMatrix& operator*=(const DeviceScalar<Scalar>& alpha);
+  /** Element-wise product: result[i] = this[i] * other[i] (NPP nppsMul).
+   * Returns a new DeviceMatrix. Defined out-of-line in DeviceDispatch.h. */
+  DeviceMatrix cwiseProduct(GpuContext& ctx, const DeviceMatrix& other) const;
+  /** In-place element-wise product: this[i] = a[i] * b[i] (NPP nppsMul).
+   * Reuses this matrix's buffer when sizes match, avoiding cudaMalloc. */
+  void cwiseProduct(GpuContext& ctx, const DeviceMatrix& a, const DeviceMatrix& b);
+  /** this += DeviceScalar * x (cuBLAS axpy with POINTER_MODE_DEVICE). */
+  DeviceMatrix& operator+=(const DeviceScaledDevice<Scalar>& expr);
+  /** this -= DeviceScalar * x (cuBLAS axpy with negated device scalar). */
+  DeviceMatrix& operator-=(const DeviceScaledDevice<Scalar>& expr);
+  /** Assign from an SpMV expression: d_y = d_A * d_x. */
+  DeviceMatrix& operator=(const SpMVExpr<Scalar>& expr);
+  /** Assign from an add expression: d_C = alpha * d_A + beta * d_B (cuBLAS geam). */
+  DeviceMatrix& operator=(const DeviceAddExpr<Scalar>& expr);
+  /** No-op — all DeviceMatrix operations are implicitly noalias.
+   *
+   * Unlike Eigen's Matrix, where omitting .noalias() triggers a copy to a
+   * temporary for safety, DeviceMatrix dispatches directly to NVIDIA library
+   * calls which have no built-in aliasing protection. Every assignment
+   * (`d_C = d_A * d_B`, `d_y = d_A * d_x`, etc.) behaves as if .noalias()
+   * were specified. The caller must ensure operands don't alias the
+   * destination for GEMM and SpMV. geam (`d_C = d_A + alpha * d_B`) is
+   * safe with aliasing. Debug asserts catch violations.
+   *
+   * This method exists so that `tmp.noalias() = mat * p` compiles for both
+   * Matrix and DeviceMatrix. */
+  DeviceMatrix& noalias() { return *this; }
  private:
   // ---- Private: adopt a raw device pointer (used by friend solvers) --------
-  DeviceMatrix(Scalar* device_ptr, Index rows, Index cols, Index outerStride)
-      : data_(device_ptr), rows_(rows), cols_(cols), outerStride_(outerStride) {}
+  DeviceMatrix(Scalar* device_ptr, Index rows, Index cols) : data_(device_ptr), rows_(rows), cols_(cols) {}
   /** Transfer ownership of the device pointer out. Zeros internal state. */
   Scalar* release() {
@@ -475,7 +577,6 @@ class DeviceMatrix {
     data_ = nullptr;
     rows_ = 0;
     cols_ = 0;
-    outerStride_ = 0;
     if (ready_event_) {
       (void)cudaEventDestroy(ready_event_);
       ready_event_ = nullptr;
@@ -500,13 +601,18 @@ class DeviceMatrix {
   friend class GpuLLT;
   template <typename>
   friend class GpuLU;
+  template <typename>
+  friend class GpuQR;
+  template <typename>
+  friend class GpuSVD;
+  template <typename>
+  friend class GpuSelfAdjointEigenSolver;
   // ---- Data members --------------------------------------------------------
   Scalar* data_ = nullptr;
   Index rows_ = 0;
   Index cols_ = 0;
-  Index outerStride_ = 0;
   cudaEvent_t ready_event_ = nullptr;    // internal: tracks last write completion
   cudaStream_t ready_stream_ = nullptr;  // stream that recorded ready_event_ (for same-stream skip)
   internal::DeviceBuffer retained_buffer_;  // internal: keeps async aux buffers alive
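These declarations are what let an Eigen-style CG iteration compile unchanged for DeviceMatrix. A minimal sketch against the interface above (the system matrix d_A, the size n, and the initialization are illustrative, not from this diff):

GpuContext ctx;
DeviceMatrix<double> d_x(n), d_r(n), d_p(n), d_tmp(n);
d_x.setZero(ctx);
// ... upload b into d_r and copy it into d_p ...
auto rsold = d_r.squaredNorm(ctx);           // DeviceScalar, still on device
d_tmp.noalias() = d_A * d_p;                 // implicitly-noalias dispatch
double alpha = rsold / d_p.dot(ctx, d_tmp);  // first host read syncs the stream
d_x += alpha * d_p;                          // cuBLAS axpy
d_r -= alpha * d_tmp;                        // cuBLAS axpy with negated alpha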


@@ -0,0 +1,121 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Device-resident scalar for deferred host synchronization.
//
// DeviceScalar<Scalar> wraps a single value in device memory. Reductions
// (dot, nrm2) write results directly to device memory via
// CUBLAS_POINTER_MODE_DEVICE, deferring host sync until the value is read.
//
// Implicit conversion to Scalar triggers cudaStreamSynchronize + download.
// In CG, this reduces 3 syncs/iter to effectively 1: the first conversion
// syncs the stream, subsequent conversions in the same expression just
// download (the stream is already flushed).
//
// Usage:
// auto dot_val = d_x.dot(d_y); // DeviceScalar, no sync
// auto norm_val = d_r.squaredNorm(); // DeviceScalar, no sync
// Scalar alpha = absNew / dot_val; // sync here (both values downloaded)
// d_x += alpha * d_p; // host-scalar axpy (as before)
#ifndef EIGEN_GPU_DEVICE_SCALAR_H
#define EIGEN_GPU_DEVICE_SCALAR_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include "./DeviceScalarOps.h"
namespace Eigen {
template <typename Scalar_>
class DeviceScalar {
public:
using Scalar = Scalar_;
/** Allocate uninitialized device scalar. Contents are undefined until written
* (e.g., by cuBLAS dot/nrm2 with POINTER_MODE_DEVICE). Consistent with
* DeviceMatrix(rows, cols) which also does not zero-initialize. */
explicit DeviceScalar(cudaStream_t stream = nullptr) : d_val_(sizeof(Scalar)), stream_(stream) {}
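  // Note: cudaMemcpyAsync from pageable host memory returns only after the
  // source has been staged for transfer, so taking the address of the by-value
  // host_val parameter below is safe despite the asynchronous copy.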
DeviceScalar(Scalar host_val, cudaStream_t stream) : d_val_(sizeof(Scalar)), stream_(stream) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_val_.ptr, &host_val, sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
}
DeviceScalar(DeviceScalar&& o) noexcept : d_val_(std::move(o.d_val_)), stream_(o.stream_) { o.stream_ = nullptr; }
DeviceScalar& operator=(DeviceScalar&& o) noexcept {
if (this != &o) {
d_val_ = std::move(o.d_val_);
stream_ = o.stream_;
o.stream_ = nullptr;
}
return *this;
}
DeviceScalar(const DeviceScalar&) = delete;
DeviceScalar& operator=(const DeviceScalar&) = delete;
/** Download from device. Synchronizes the stream on first call;
* subsequent calls in the same expression are cheap (stream already flushed). */
Scalar get() const {
Scalar result;
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&result, d_val_.ptr, sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
/** Implicit conversion — allows `Scalar alpha = deviceScalar` and
* `if (deviceScalar < threshold)`. Triggers sync. */
operator Scalar() const { return get(); }
Scalar* devicePtr() { return static_cast<Scalar*>(d_val_.ptr); }
const Scalar* devicePtr() const { return static_cast<const Scalar*>(d_val_.ptr); }
cudaStream_t stream() const { return stream_; }
// ---- Device-side arithmetic (no host sync) ---------------------------------
// Uses NPP from DeviceScalarOps.h. All results stay on device.
// Currently supports real types only (float, double). Complex types
// fall back to implicit conversion (host sync) for division.
//
// Note: DeviceScalar has no cross-stream readiness tracking. All
// operations must be on the same CUDA stream. This is the natural
// pattern in iterative solvers where one GpuContext owns all work.
friend DeviceScalar operator/(const DeviceScalar& a, const DeviceScalar& b) {
DeviceScalar result(a.stream_);
internal::device_scalar_div(a.devicePtr(), b.devicePtr(), result.devicePtr(), a.stream_);
return result;
}
friend DeviceScalar operator/(Scalar a, const DeviceScalar& b) {
DeviceScalar d_a(a, b.stream_);
return d_a / b;
}
friend DeviceScalar operator/(const DeviceScalar& a, Scalar b) {
DeviceScalar d_b(b, a.stream_);
return a / d_b;
}
DeviceScalar operator-() const {
DeviceScalar result(stream_);
internal::device_scalar_neg(devicePtr(), result.devicePtr(), stream_);
return result;
}
private:
internal::DeviceBuffer d_val_;
cudaStream_t stream_ = nullptr;
};
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_SCALAR_H


@@ -0,0 +1,117 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Device-resident scalar and element-wise operations via NPP signals.
// Header-only — no custom CUDA kernels needed. Uses nppsDiv, nppsMul,
// nppsMulC from the NPP library (CUDA::npps, part of the CUDA toolkit).
#ifndef EIGEN_GPU_DEVICE_SCALAR_OPS_H
#define EIGEN_GPU_DEVICE_SCALAR_OPS_H
#include <cuda_runtime.h>
#include <npps_arithmetic_and_logical_operations.h>
namespace Eigen {
namespace internal {
// ---- NppStreamContext helper ------------------------------------------------
inline NppStreamContext make_npp_stream_ctx(cudaStream_t stream) {
// Cache device attributes (constant for process lifetime) in a thread-local.
// Only the stream and its flags vary per call.
struct CachedDeviceInfo {
bool initialized = false;
int device_id = 0;
int cc_major = 0;
int cc_minor = 0;
int mp_count = 0;
int max_threads_per_mp = 0;
int max_threads_per_block = 0;
int shared_mem_per_block = 0;
void init() {
if (initialized) return;
cudaGetDevice(&device_id);
cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device_id);
cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device_id);
cudaDeviceGetAttribute(&mp_count, cudaDevAttrMultiProcessorCount, device_id);
cudaDeviceGetAttribute(&max_threads_per_mp, cudaDevAttrMaxThreadsPerMultiProcessor, device_id);
cudaDeviceGetAttribute(&max_threads_per_block, cudaDevAttrMaxThreadsPerBlock, device_id);
cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id);
initialized = true;
}
};
thread_local CachedDeviceInfo cached;
cached.init();
NppStreamContext ctx = {};
ctx.hStream = stream;
ctx.nCudaDeviceId = cached.device_id;
ctx.nCudaDevAttrComputeCapabilityMajor = cached.cc_major;
ctx.nCudaDevAttrComputeCapabilityMinor = cached.cc_minor;
ctx.nMultiProcessorCount = cached.mp_count;
ctx.nMaxThreadsPerMultiProcessor = cached.max_threads_per_mp;
ctx.nMaxThreadsPerBlock = cached.max_threads_per_block;
ctx.nSharedMemPerBlock = cached.shared_mem_per_block;
cudaStreamGetFlags(stream, &ctx.nStreamFlags);
return ctx;
}
// ---- Scalar division: c = a / b (device-resident, async) --------------------
inline void device_scalar_div(const float* a, const float* b, float* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_32f_Ctx(b, a, c, 1, npp_ctx); // NPP: pDst[i] = pSrc2[i] / pSrc1[i]
}
inline void device_scalar_div(const double* a, const double* b, double* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_64f_Ctx(b, a, c, 1, npp_ctx); // NPP: pDst[i] = pSrc2[i] / pSrc1[i]
}
// ---- Scalar negation: c = -a (device-resident, async) -----------------------
inline void device_scalar_neg(const float* a, float* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMulC_32f_Ctx(a, -1.0f, c, 1, npp_ctx);
}
inline void device_scalar_neg(const double* a, double* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMulC_64f_Ctx(a, -1.0, c, 1, npp_ctx);
}
// ---- Element-wise vector multiply: c[i] = a[i] * b[i] ----------------------
inline void device_cwiseProduct(const float* a, const float* b, float* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMul_32f_Ctx(a, b, c, static_cast<size_t>(n), npp_ctx);
}
inline void device_cwiseProduct(const double* a, const double* b, double* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMul_64f_Ctx(a, b, c, static_cast<size_t>(n), npp_ctx);
}
// ---- Element-wise vector division: c[i] = a[i] / b[i] ----------------------
inline void device_cwiseQuotient(const float* a, const float* b, float* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_32f_Ctx(b, a, c, static_cast<size_t>(n), npp_ctx); // NPP: dst = src2 / src1
}
inline void device_cwiseQuotient(const double* a, const double* b, double* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_64f_Ctx(b, a, c, static_cast<size_t>(n), npp_ctx);
}
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_SCALAR_OPS_H
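All of these wrappers enqueue work on the caller's stream and return immediately, so they compose with other asynchronous operations. A hedged sketch, assuming d_a, d_b, and d_c already point to n doubles in device memory:

cudaStream_t s;
cudaStreamCreate(&s);
Eigen::internal::device_cwiseProduct(d_a, d_b, d_c, n, s);  // enqueued, not yet done
// ... enqueue more kernels or copies on s ...
cudaStreamSynchronize(s);  // d_c holds a[i] * b[i] only after this returns
cudaStreamDestroy(s);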


@@ -28,6 +28,8 @@
 #include "./CuBlasSupport.h"
 #include "./CuSolverSupport.h"
+#include <cusparse.h>
+#include <cufft.h>
 namespace Eigen {
@@ -44,38 +46,92 @@
  */
 class GpuContext {
  public:
-  GpuContext() {
+  /** Create a new context with a dedicated CUDA stream. */
+  GpuContext() : owns_stream_(true) {
     EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
-    EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
-    EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
-    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
-    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
+    init_handles();
   }
+  /** Create a context on an existing stream (e.g., stream 0 = nullptr).
+   * The caller retains ownership of the stream — this context will not destroy it. */
+  explicit GpuContext(cudaStream_t stream) : stream_(stream), owns_stream_(false) { init_handles(); }
   ~GpuContext() {
+    if (cusparse_) (void)cusparseDestroy(cusparse_);
     if (cusolver_) (void)cusolverDnDestroy(cusolver_);
+    if (cublas_lt_) (void)cublasLtDestroy(cublas_lt_);
     if (cublas_) (void)cublasDestroy(cublas_);
-    if (stream_) (void)cudaStreamDestroy(stream_);
+    if (owns_stream_ && stream_) (void)cudaStreamDestroy(stream_);
   }
   // Non-copyable, non-movable (owns library handles).
   GpuContext(const GpuContext&) = delete;
   GpuContext& operator=(const GpuContext&) = delete;
-  /** Lazily-created thread-local default context. */
+  /** Get the thread-local default context.
+   * If setThreadLocal() has been called, returns that context.
+   * Otherwise lazily creates a new context with a dedicated stream. */
   static GpuContext& threadLocal() {
+    GpuContext* override = tl_override_ptr();
+    if (override) return *override;
     thread_local GpuContext ctx;
     return ctx;
   }
+  /** Override the thread-local default context for this thread.
+   * The caller retains ownership of \p ctx — it must outlive all uses.
+   * Pass nullptr to restore the lazily-created default. */
+  static void setThreadLocal(GpuContext* ctx) { tl_override_ptr() = ctx; }
   cudaStream_t stream() const { return stream_; }
   cublasHandle_t cublasHandle() const { return cublas_; }
   cusolverDnHandle_t cusolverHandle() const { return cusolver_; }
+  /** cuBLASLt handle (lazy-initialized on first GEMM call). */
+  cublasLtHandle_t cublasLtHandle() const {
+    if (!cublas_lt_) {
+      EIGEN_CUBLAS_CHECK(cublasLtCreate(&cublas_lt_));
+    }
+    return cublas_lt_;
+  }
+  /** Workspace buffer for cublasLtMatmul (grown lazily by cublaslt_gemm).
+   * Not thread-safe — all GEMM calls must be on this context's stream. */
+  internal::DeviceBuffer* gemmWorkspace() const { return &gemm_workspace_; }
+  /** cuSPARSE handle (lazy-initialized on first call). */
+  cusparseHandle_t cusparseHandle() const {
+    if (!cusparse_) {
+      cusparseStatus_t s1 = cusparseCreate(&cusparse_);
+      eigen_assert(s1 == CUSPARSE_STATUS_SUCCESS && "cusparseCreate failed");
+      EIGEN_UNUSED_VARIABLE(s1);
+      cusparseStatus_t s2 = cusparseSetStream(cusparse_, stream_);
+      eigen_assert(s2 == CUSPARSE_STATUS_SUCCESS && "cusparseSetStream failed");
+      EIGEN_UNUSED_VARIABLE(s2);
+    }
+    return cusparse_;
+  }
  private:
   cudaStream_t stream_ = nullptr;
   cublasHandle_t cublas_ = nullptr;
   cusolverDnHandle_t cusolver_ = nullptr;
+  mutable cublasLtHandle_t cublas_lt_ = nullptr;   // lazy
+  mutable cusparseHandle_t cusparse_ = nullptr;    // lazy
+  mutable internal::DeviceBuffer gemm_workspace_;  // lazy
+  bool owns_stream_ = true;
+  static GpuContext*& tl_override_ptr() {
+    thread_local GpuContext* ptr = nullptr;
+    return ptr;
+  }
+  void init_handles() {
+    EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
+    EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
+  }
 };
 } // namespace Eigen
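The stream-borrowing constructor and setThreadLocal() together let the DeviceMatrix operator overloads, which default to the thread-local context, share one caller-owned stream. A hedged usage sketch:

cudaStream_t s;
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&s));
{
  Eigen::GpuContext ctx(s);                    // borrows s, will not destroy it
  Eigen::GpuContext::setThreadLocal(&ctx);
  // ... d_x += alpha * d_p etc. now all dispatch on s ...
  Eigen::GpuContext::setThreadLocal(nullptr);  // restore the lazy default
}
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(s));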


@@ -0,0 +1,232 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU self-adjoint eigenvalue decomposition using cuSOLVER.
//
// Wraps cusolverDnXsyevd (symmetric/Hermitian divide-and-conquer).
// Stores eigenvalues and eigenvectors on device.
//
// Usage:
// GpuSelfAdjointEigenSolver<double> es(A);
// VectorXd eigenvals = es.eigenvalues();
// MatrixXd eigenvecs = es.eigenvectors();
#ifndef EIGEN_GPU_EIGENSOLVER_H
#define EIGEN_GPU_EIGENSOLVER_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSolverSupport.h"
#include <vector>
namespace Eigen {
template <typename Scalar_>
class GpuSelfAdjointEigenSolver {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
using RealVector = Matrix<RealScalar, Dynamic, 1>;
/** Eigenvalue-only or eigenvalues + eigenvectors. */
enum ComputeMode { EigenvaluesOnly, ComputeEigenvectors };
GpuSelfAdjointEigenSolver() { init_context(); }
template <typename InputType>
explicit GpuSelfAdjointEigenSolver(const EigenBase<InputType>& A, ComputeMode mode = ComputeEigenvectors) {
init_context();
compute(A, mode);
}
~GpuSelfAdjointEigenSolver() {
if (handle_) (void)cusolverDnDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuSelfAdjointEigenSolver(const GpuSelfAdjointEigenSolver&) = delete;
GpuSelfAdjointEigenSolver& operator=(const GpuSelfAdjointEigenSolver&) = delete;
// ---- Factorization -------------------------------------------------------
template <typename InputType>
GpuSelfAdjointEigenSolver& compute(const EigenBase<InputType>& A, ComputeMode mode = ComputeEigenvectors) {
eigen_assert(A.rows() == A.cols() && "GpuSelfAdjointEigenSolver requires a square matrix");
mode_ = mode;
n_ = A.rows();
info_ = InvalidInput;
info_synced_ = false;
if (n_ == 0) {
info_ = Success;
info_synced_ = true;
return *this;
}
const PlainMatrix mat(A.derived());
lda_ = static_cast<int64_t>(n_);
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
// syevd overwrites A with eigenvectors (if requested).
d_A_ = internal::DeviceBuffer(mat_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
factorize();
return *this;
}
GpuSelfAdjointEigenSolver& compute(const DeviceMatrix<Scalar>& d_A, ComputeMode mode = ComputeEigenvectors) {
eigen_assert(d_A.rows() == d_A.cols() && "GpuSelfAdjointEigenSolver requires a square matrix");
mode_ = mode;
n_ = d_A.rows();
info_ = InvalidInput;
info_synced_ = false;
if (n_ == 0) {
info_ = Success;
info_synced_ = true;
return *this;
}
d_A.waitReady(stream_);
lda_ = static_cast<int64_t>(n_);
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
d_A_ = internal::DeviceBuffer(mat_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
factorize();
return *this;
}
// ---- Accessors -----------------------------------------------------------
ComputationInfo info() const {
sync_info();
return info_;
}
Index cols() const { return n_; }
Index rows() const { return n_; }
// TODO: Add device-side accessors (deviceEigenvalues(), deviceEigenvectors())
// returning DeviceMatrix views of the internal buffers, so users can chain
// GPU operations without round-tripping through host memory.
/** Eigenvalues in ascending order. Downloads from device. */
RealVector eigenvalues() const {
sync_info();
eigen_assert(info_ == Success);
RealVector W(n_);
if (n_ > 0) {
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpy(W.data(), d_W_.ptr, static_cast<size_t>(n_) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
}
return W;
}
/** Eigenvectors (columns). Downloads from device.
* Requires ComputeEigenvectors mode. */
PlainMatrix eigenvectors() const {
sync_info();
eigen_assert(info_ == Success);
eigen_assert(mode_ == ComputeEigenvectors && "eigenvectors() requires ComputeEigenvectors mode");
PlainMatrix V(n_, n_);
if (n_ > 0) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(V.data(), d_A_.ptr,
static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar),
cudaMemcpyDeviceToHost));
}
return V;
}
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusolverDnHandle_t handle_ = nullptr;
internal::CusolverParams params_;
internal::DeviceBuffer d_A_; // overwritten with eigenvectors by syevd
internal::DeviceBuffer d_W_; // eigenvalues (RealScalar, length n)
internal::DeviceBuffer d_scratch_; // workspace + info
size_t scratch_size_ = 0;
std::vector<char> h_workspace_;
ComputeMode mode_ = ComputeEigenvectors;
Index n_ = 0;
int64_t lda_ = 0;
ComputationInfo info_ = InvalidInput;
int info_word_ = 0;
bool info_synced_ = true;
void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
ensure_scratch(0);
}
void ensure_scratch(size_t workspace_bytes) {
constexpr size_t kAlign = 16;
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
size_t needed = workspace_bytes + sizeof(int);
if (needed > scratch_size_) {
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_scratch_ = internal::DeviceBuffer(needed);
scratch_size_ = needed;
}
}
void* scratch_workspace() const { return d_scratch_.ptr; }
int* scratch_info() const {
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
}
void sync_info() const {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
const_cast<GpuSelfAdjointEigenSolver*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
const_cast<GpuSelfAdjointEigenSolver*>(this)->info_synced_ = true;
}
}
void factorize() {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
info_synced_ = false;
info_ = InvalidInput;
d_W_ = internal::DeviceBuffer(static_cast<size_t>(n_) * sizeof(RealScalar));
const cusolverEigMode_t jobz =
(mode_ == ComputeEigenvectors) ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
// Use lower triangle (standard convention).
constexpr cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
size_t dev_ws = 0, host_ws = 0;
EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd_bufferSize(handle_, params_.p, jobz, uplo, static_cast<int64_t>(n_), dtype,
d_A_.ptr, lda_, rtype, d_W_.ptr, dtype, &dev_ws, &host_ws));
ensure_scratch(dev_ws);
h_workspace_.resize(host_ws);
EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd(handle_, params_.p, jobz, uplo, static_cast<int64_t>(n_), dtype, d_A_.ptr,
lda_, rtype, d_W_.ptr, dtype, scratch_workspace(), dev_ws,
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
}
};
} // namespace Eigen
#endif // EIGEN_GPU_EIGENSOLVER_H
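As with the other cuSOLVER wrappers, correctness is easiest to sanity-check against the CPU path. A hedged sketch (assumes using namespace Eigen and a size n):

MatrixXd M = MatrixXd::Random(n, n);
MatrixXd A = 0.5 * (M + M.transpose());  // symmetrize the test matrix
GpuSelfAdjointEigenSolver<double> gpu(A);
SelfAdjointEigenSolver<MatrixXd> cpu(A);
if (gpu.info() == Success) {
  // Both report eigenvalues in ascending order, so they compare directly.
  double err = (gpu.eigenvalues() - cpu.eigenvalues()).norm();
}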

Eigen/src/GPU/GpuFFT.h

@@ -0,0 +1,308 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU FFT via cuFFT.
//
// Standalone GPU FFT class with plan caching. Supports 1D and 2D transforms:
// C2C (complex-to-complex), R2C (real-to-complex), C2R (complex-to-real).
//
// Inverse transforms are scaled by 1/n (1D) or 1/(n*m) (2D) so that
// inv(fwd(x)) == x, matching Eigen's FFT convention.
//
// cuFFT plans are cached by (size, type) and reused across calls.
//
// Usage:
// GpuFFT<float> fft;
// VectorXcf X = fft.fwd(x); // 1D C2C or R2C
// VectorXcf y = fft.inv(X); // 1D C2C inverse
// VectorXf r = fft.invReal(X, n); // 1D C2R inverse
// MatrixXcf B = fft.fwd2d(A); // 2D C2C forward
// MatrixXcf C = fft.inv2d(B); // 2D C2C inverse
#ifndef EIGEN_GPU_FFT_H
#define EIGEN_GPU_FFT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuFftSupport.h"
#include "./CuBlasSupport.h"
#include <map>
namespace Eigen {
template <typename Scalar_>
class GpuFFT {
public:
using Scalar = Scalar_;
using Complex = std::complex<Scalar>;
using ComplexVector = Matrix<Complex, Dynamic, 1>;
using RealVector = Matrix<Scalar, Dynamic, 1>;
using ComplexMatrix = Matrix<Complex, Dynamic, Dynamic, ColMajor>;
GpuFFT() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
}
~GpuFFT() {
for (auto& kv : plans_) (void)cufftDestroy(kv.second);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuFFT(const GpuFFT&) = delete;
GpuFFT& operator=(const GpuFFT&) = delete;
// ---- 1D Complex-to-Complex ------------------------------------------------
/** Forward 1D C2C FFT. */
template <typename Derived>
ComplexVector fwd(const MatrixBase<Derived>& x,
typename std::enable_if<NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
const ComplexVector input(x.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);
ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));
ComplexVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
/** Inverse 1D C2C FFT. Scaled by 1/n. */
template <typename Derived>
ComplexVector inv(const MatrixBase<Derived>& X) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv() requires complex input");
const ComplexVector input(X.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);
ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));
// Scale by 1/n.
scale_device(static_cast<Complex*>(d_out_.ptr), n, Scalar(1) / Scalar(n));
ComplexVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- 1D Real-to-Complex ---------------------------------------------------
/** Forward 1D R2C FFT. Returns n/2+1 complex values (half-spectrum). */
template <typename Derived>
ComplexVector fwd(const MatrixBase<Derived>& x,
typename std::enable_if<!NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
const RealVector input(x.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);
const int n_complex = n / 2 + 1;
ensure_buffers(n * sizeof(Scalar), n_complex * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_r2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(
internal::cufftExecR2C_dispatch(plan, static_cast<Scalar*>(d_in_.ptr), static_cast<Complex*>(d_out_.ptr)));
ComplexVector result(n_complex);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n_complex * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- 1D Complex-to-Real ---------------------------------------------------
/** Inverse 1D C2R FFT. Input is n/2+1 complex values, output is nfft real values.
* Scaled by 1/nfft. Caller must specify nfft (original real signal length). */
template <typename Derived>
RealVector invReal(const MatrixBase<Derived>& X, Index nfft) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "invReal() requires complex input");
const ComplexVector input(X.derived());
const int n = static_cast<int>(nfft);
const int n_complex = n / 2 + 1;
eigen_assert(input.size() == n_complex);
if (n == 0) return RealVector(0);
ensure_buffers(n_complex * sizeof(Complex), n * sizeof(Scalar));
// cuFFT C2R may overwrite the input, so we copy to d_in_.
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n_complex * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_c2r_type<Scalar>::value);
EIGEN_CUFFT_CHECK(
internal::cufftExecC2R_dispatch(plan, static_cast<Complex*>(d_in_.ptr), static_cast<Scalar*>(d_out_.ptr)));
// Scale by 1/n.
scale_device_real(static_cast<Scalar*>(d_out_.ptr), n, Scalar(1) / Scalar(n));
RealVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- 2D Complex-to-Complex ------------------------------------------------
/** Forward 2D C2C FFT. Input and output are rows x cols complex matrices. */
template <typename Derived>
ComplexMatrix fwd2d(const MatrixBase<Derived>& A) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "fwd2d() requires complex input");
const ComplexMatrix input(A.derived());
const int rows = static_cast<int>(input.rows());
const int cols = static_cast<int>(input.cols());
if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
ensure_buffers(total, total);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));
ComplexMatrix result(rows, cols);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
/** Inverse 2D C2C FFT. Scaled by 1/(rows*cols). */
template <typename Derived>
ComplexMatrix inv2d(const MatrixBase<Derived>& A) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv2d() requires complex input");
const ComplexMatrix input(A.derived());
const int rows = static_cast<int>(input.rows());
const int cols = static_cast<int>(input.cols());
if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
ensure_buffers(total, total);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));
// Scale by 1/(rows*cols).
const int total_elems = rows * cols;
scale_device(static_cast<Complex*>(d_out_.ptr), total_elems, Scalar(1) / Scalar(total_elems));
ComplexMatrix result(rows, cols);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- Accessors ------------------------------------------------------------
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cublasHandle_t cublas_ = nullptr;
std::map<int64_t, cufftHandle> plans_;
internal::DeviceBuffer d_in_;
internal::DeviceBuffer d_out_;
size_t d_in_size_ = 0;
size_t d_out_size_ = 0;
void ensure_buffers(size_t in_bytes, size_t out_bytes) {
if (in_bytes > d_in_size_) {
if (d_in_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_in_ = internal::DeviceBuffer(in_bytes);
d_in_size_ = in_bytes;
}
if (out_bytes > d_out_size_) {
if (d_out_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_out_ = internal::DeviceBuffer(out_bytes);
d_out_size_ = out_bytes;
}
}
  // Plan key encoding (low bits to high): rank (1 bit) | type (8 bits) | dims.
  // cufftType values reach 0x6c (CUFFT_Z2D), so the shifted type occupies
  // bits 1-8; dims start at bit 9 so they cannot overlap the type field and
  // alias plans for different sizes.
  static int64_t plan_key_1d(int n, cufftType type) { return (int64_t(n) << 9) | (int64_t(type) << 1) | 0; }
  static int64_t plan_key_2d(int rows, int cols, cufftType type) {
    return (int64_t(rows) << 39) | (int64_t(cols) << 9) | (int64_t(type) << 1) | 1;
  }
cufftHandle get_plan_1d(int n, cufftType type) {
int64_t key = plan_key_1d(n, type);
auto it = plans_.find(key);
if (it != plans_.end()) return it->second;
cufftHandle plan;
EIGEN_CUFFT_CHECK(cufftPlan1d(&plan, n, type, /*batch=*/1));
EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
plans_[key] = plan;
return plan;
}
cufftHandle get_plan_2d(int rows, int cols, cufftType type) {
int64_t key = plan_key_2d(rows, cols, type);
auto it = plans_.find(key);
if (it != plans_.end()) return it->second;
// cuFFT uses row-major (C order) for 2D: first dim = rows, second = cols.
// Eigen matrices are column-major, so we pass (cols, rows) to cuFFT
// to get the correct 2D transform.
cufftHandle plan;
EIGEN_CUFFT_CHECK(cufftPlan2d(&plan, cols, rows, type));
EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
plans_[key] = plan;
return plan;
}
// Scale complex array on device using cuBLAS scal.
void scale_device(Complex* d_ptr, int n, Scalar alpha) { scale_complex(cublas_, d_ptr, n, alpha); }
// Scale real array on device using cuBLAS scal.
void scale_device_real(Scalar* d_ptr, int n, Scalar alpha) { scale_real(cublas_, d_ptr, n, alpha); }
// Type-dispatched cuBLAS scal wrappers (C++14 compatible).
static void scale_complex(cublasHandle_t h, std::complex<float>* p, int n, float a) {
EIGEN_CUBLAS_CHECK(cublasCsscal(h, n, &a, reinterpret_cast<cuComplex*>(p), 1));
}
static void scale_complex(cublasHandle_t h, std::complex<double>* p, int n, double a) {
EIGEN_CUBLAS_CHECK(cublasZdscal(h, n, &a, reinterpret_cast<cuDoubleComplex*>(p), 1));
}
static void scale_real(cublasHandle_t h, float* p, int n, float a) {
EIGEN_CUBLAS_CHECK(cublasSscal(h, n, &a, p, 1));
}
static void scale_real(cublasHandle_t h, double* p, int n, double a) {
EIGEN_CUBLAS_CHECK(cublasDscal(h, n, &a, p, 1));
}
};
} // namespace Eigen
#endif // EIGEN_GPU_FFT_H
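A hedged round-trip sketch of the scaling convention described in the header comment (assumes using namespace Eigen):

GpuFFT<float> fft;
VectorXcf x = VectorXcf::Random(1024);
VectorXcf X = fft.fwd(x);                // 1D C2C forward
VectorXcf y = fft.inv(X);                // C2C inverse, scaled by 1/n
float c2c_err = (y - x).norm();          // expect roundoff-level error

VectorXf r = VectorXf::Random(1024);
VectorXcf R = fft.fwd(r);                // R2C half-spectrum: 513 coefficients
VectorXf r2 = fft.invReal(R, r.size());  // C2R back to 1024 reals, scaled by 1/n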


@@ -149,7 +149,7 @@ class GpuLLT {
     // Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions).
     const PlainMatrix mat(A.derived());
-    lda_ = static_cast<int64_t>(mat.outerStride());
+    lda_ = static_cast<int64_t>(mat.rows());
     allocate_factor_storage();
     EIGEN_CUDA_RUNTIME_CHECK(
         cudaMemcpyAsync(d_factor_.ptr, mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_));
@@ -163,7 +163,7 @@ class GpuLLT {
     eigen_assert(d_A.rows() == d_A.cols());
     if (!begin_compute(d_A.rows())) return *this;
-    lda_ = static_cast<int64_t>(d_A.outerStride());
+    lda_ = static_cast<int64_t>(d_A.rows());
     d_A.waitReady(stream_);
     allocate_factor_storage();
     EIGEN_CUDA_RUNTIME_CHECK(
@@ -178,7 +178,7 @@ class GpuLLT {
     eigen_assert(d_A.rows() == d_A.cols());
     if (!begin_compute(d_A.rows())) return *this;
-    lda_ = static_cast<int64_t>(d_A.outerStride());
+    lda_ = static_cast<int64_t>(d_A.rows());
     d_A.waitReady(stream_);
     d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
@@ -205,7 +205,7 @@ class GpuLLT {
     const PlainMatrix rhs(B);
     const int64_t nrhs = static_cast<int64_t>(rhs.cols());
-    const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
+    const int64_t ldb = static_cast<int64_t>(rhs.rows());
     DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
       EIGEN_CUDA_RUNTIME_CHECK(
           cudaMemcpyAsync(d_x_ptr, rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
@@ -234,7 +234,7 @@ class GpuLLT {
     eigen_assert(d_B.rows() == n_);
     d_B.waitReady(stream_);
     const int64_t nrhs = static_cast<int64_t>(d_B.cols());
-    const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
+    const int64_t ldb = static_cast<int64_t>(d_B.rows());
     return solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
       EIGEN_CUDA_RUNTIME_CHECK(
           cudaMemcpyAsync(d_x_ptr, d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
@@ -332,7 +332,7 @@ class GpuLLT {
     EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, static_cast<int64_t>(n_), nrhs, dtype,
                                           d_factor_.ptr, lda_, dtype, d_x_ptr, ldb, scratch_info()));
-    DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs), static_cast<Index>(ldb));
+    DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs));
     result.recordReady(stream_);
     return result;
   }


@@ -140,7 +140,7 @@ class GpuLU {
     if (!begin_compute(A.rows())) return *this;
     const PlainMatrix mat(A.derived());
-    lda_ = static_cast<int64_t>(mat.outerStride());
+    lda_ = static_cast<int64_t>(mat.rows());
     allocate_lu_storage();
     EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_));
@@ -153,7 +153,7 @@ class GpuLU {
     eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
     if (!begin_compute(d_A.rows())) return *this;
-    lda_ = static_cast<int64_t>(d_A.outerStride());
+    lda_ = static_cast<int64_t>(d_A.rows());
     d_A.waitReady(stream_);
     allocate_lu_storage();
     EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_));
@@ -167,7 +167,7 @@ class GpuLU {
     eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
     if (!begin_compute(d_A.rows())) return *this;
-    lda_ = static_cast<int64_t>(d_A.outerStride());
+    lda_ = static_cast<int64_t>(d_A.rows());
     d_A.waitReady(stream_);
     d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
@@ -190,7 +190,7 @@ class GpuLU {
     const PlainMatrix rhs(B);
     const int64_t nrhs = static_cast<int64_t>(rhs.cols());
-    const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
+    const int64_t ldb = static_cast<int64_t>(rhs.rows());
     DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
       EIGEN_CUDA_RUNTIME_CHECK(
           cudaMemcpyAsync(d_x_ptr, rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
@@ -213,7 +213,7 @@ class GpuLU {
     eigen_assert(d_B.rows() == n_);
     d_B.waitReady(stream_);
     const int64_t nrhs = static_cast<int64_t>(d_B.cols());
-    const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
+    const int64_t ldb = static_cast<int64_t>(d_B.rows());
     return solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
       EIGEN_CUDA_RUNTIME_CHECK(
           cudaMemcpyAsync(d_x_ptr, d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
@@ -305,7 +305,7 @@ class GpuLU {
                                           lda_, static_cast<const int64_t*>(d_ipiv_.ptr), dtype, d_x_ptr, ldb,
                                           scratch_info()));
-    DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs), static_cast<Index>(ldb));
+    DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs));
     result.recordReady(stream_);
     return result;
   }

Eigen/src/GPU/GpuQR.h

@@ -0,0 +1,389 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU QR decomposition using cuSOLVER.
//
// Wraps cusolverDnXgeqrf (factorization), cusolverDnXormqr (apply Q),
// cusolverDnXorgqr (form Q), and cublasXtrsm (triangular solve on R).
//
// The factored matrix (reflectors + R) and tau stay in device memory.
// Solve uses ormqr + trsm without forming Q explicitly.
//
// Usage:
// GpuQR<double> qr(A); // upload A, geqrf
// if (qr.info() != Success) { ... }
// MatrixXd X = qr.solve(B); // Q^H * B via ormqr, then trsm on R
//
// Expression syntax:
// d_X = d_A.qr().solve(d_B); // temporary, no caching
#ifndef EIGEN_GPU_QR_H
#define EIGEN_GPU_QR_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSolverSupport.h"
#include "./CuBlasSupport.h"
#include <vector>
namespace Eigen {
template <typename Scalar_>
class GpuQR {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
GpuQR() { init_context(); }
template <typename InputType>
explicit GpuQR(const EigenBase<InputType>& A) {
init_context();
compute(A);
}
~GpuQR() {
if (handle_) (void)cusolverDnDestroy(handle_);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuQR(const GpuQR&) = delete;
GpuQR& operator=(const GpuQR&) = delete;
GpuQR(GpuQR&& o) noexcept
: stream_(o.stream_),
handle_(o.handle_),
cublas_(o.cublas_),
params_(std::move(o.params_)),
d_qr_(std::move(o.d_qr_)),
d_tau_(std::move(o.d_tau_)),
d_scratch_(std::move(o.d_scratch_)),
scratch_size_(o.scratch_size_),
h_workspace_(std::move(o.h_workspace_)),
m_(o.m_),
n_(o.n_),
lda_(o.lda_),
info_(o.info_),
info_word_(o.info_word_),
info_synced_(o.info_synced_) {
o.stream_ = nullptr;
o.handle_ = nullptr;
o.cublas_ = nullptr;
o.scratch_size_ = 0;
o.m_ = 0;
o.n_ = 0;
o.lda_ = 0;
o.info_ = InvalidInput;
o.info_word_ = 0;
o.info_synced_ = true;
}
GpuQR& operator=(GpuQR&& o) noexcept {
if (this != &o) {
if (handle_) (void)cusolverDnDestroy(handle_);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
stream_ = o.stream_;
handle_ = o.handle_;
cublas_ = o.cublas_;
params_ = std::move(o.params_);
d_qr_ = std::move(o.d_qr_);
d_tau_ = std::move(o.d_tau_);
d_scratch_ = std::move(o.d_scratch_);
scratch_size_ = o.scratch_size_;
h_workspace_ = std::move(o.h_workspace_);
m_ = o.m_;
n_ = o.n_;
lda_ = o.lda_;
info_ = o.info_;
info_word_ = o.info_word_;
info_synced_ = o.info_synced_;
o.stream_ = nullptr;
o.handle_ = nullptr;
o.cublas_ = nullptr;
o.scratch_size_ = 0;
o.m_ = 0;
o.n_ = 0;
o.lda_ = 0;
o.info_ = InvalidInput;
o.info_word_ = 0;
o.info_synced_ = true;
}
return *this;
}
// ---- Factorization -------------------------------------------------------
template <typename InputType>
GpuQR& compute(const EigenBase<InputType>& A) {
m_ = A.rows();
n_ = A.cols();
info_ = InvalidInput;
info_synced_ = false;
if (m_ == 0 || n_ == 0) {
info_ = Success;
info_synced_ = true;
return *this;
}
const PlainMatrix mat(A.derived());
lda_ = static_cast<int64_t>(mat.rows());
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
const size_t tau_bytes = static_cast<size_t>((std::min)(m_, n_)) * sizeof(Scalar);
d_qr_ = internal::DeviceBuffer(mat_bytes);
d_tau_ = internal::DeviceBuffer(tau_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_qr_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
factorize();
return *this;
}
GpuQR& compute(const DeviceMatrix<Scalar>& d_A) {
m_ = d_A.rows();
n_ = d_A.cols();
info_ = InvalidInput;
info_synced_ = false;
if (m_ == 0 || n_ == 0) {
info_ = Success;
info_synced_ = true;
return *this;
}
lda_ = static_cast<int64_t>(d_A.rows());
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
const size_t tau_bytes = static_cast<size_t>((std::min)(m_, n_)) * sizeof(Scalar);
d_A.waitReady(stream_);
d_qr_ = internal::DeviceBuffer(mat_bytes);
d_tau_ = internal::DeviceBuffer(tau_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_qr_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
factorize();
return *this;
}
// ---- Solve ---------------------------------------------------------------
/** Solve A * X = B via QR: X = R^{-1} * Q^H * B (least-squares for m >= n).
* Uses ormqr (apply Q^H) + trsm (solve R), without forming Q explicitly.
* Requires m >= n (overdetermined or square). Underdetermined not supported.
*
* TODO: Add device-side accessor for the R factor (and Q application) as
* DeviceMatrix, so users can chain GPU operations without host round-trips. */
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
sync_info();
eigen_assert(info_ == Success && "GpuQR::solve called on a failed or uninitialized factorization");
eigen_assert(B.rows() == m_);
eigen_assert(m_ >= n_ && "GpuQR::solve requires m >= n (use SVD for underdetermined systems)");
const PlainMatrix rhs(B);
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
const int64_t ldb = static_cast<int64_t>(rhs.rows()); // = m_
const size_t b_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
// Upload B to device (m × nrhs buffer).
internal::DeviceBuffer d_B(b_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.ptr, rhs.data(), b_bytes, cudaMemcpyHostToDevice, stream_));
// Apply Q^H to B in-place; the first n rows of the m × nrhs buffer now hold the part of Q^H * B needed for the solve.
apply_QH(d_B.ptr, ldb, nrhs);
// Solve R * X = (Q^H * B)[0:n,:] via trsm on the first n rows.
Scalar alpha(1);
EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT, static_cast<int>(n_), static_cast<int>(nrhs), &alpha,
static_cast<const Scalar*>(d_qr_.ptr), static_cast<int>(lda_),
static_cast<Scalar*>(d_B.ptr), static_cast<int>(ldb)));
// Download the first n rows of each column (stride = ldb = m, width = n).
PlainMatrix X(n_, rhs.cols());
if (m_ == n_) {
// Square: dense copy, no stride mismatch.
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_B.ptr,
static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar),
cudaMemcpyDeviceToHost, stream_));
} else {
// Overdetermined: 2D copy to extract first n rows from each column.
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(
X.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_B.ptr, static_cast<size_t>(ldb) * sizeof(Scalar),
static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs), cudaMemcpyDeviceToHost, stream_));
}
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return X;
}
/** Solve with device-resident RHS. Returns n × nrhs DeviceMatrix. */
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
sync_info();
eigen_assert(info_ == Success && "GpuQR::solve called on a failed or uninitialized factorization");
eigen_assert(d_B.rows() == m_);
eigen_assert(m_ >= n_ && "GpuQR::solve requires m >= n (use SVD for underdetermined systems)");
d_B.waitReady(stream_);
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
const int64_t ldb = static_cast<int64_t>(d_B.rows()); // = m_
const size_t b_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
// D2D copy B into working buffer (ormqr and trsm are in-place).
internal::DeviceBuffer d_work(b_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_work.ptr, d_B.data(), b_bytes, cudaMemcpyDeviceToDevice, stream_));
apply_QH(d_work.ptr, ldb, nrhs);
// trsm on the first n rows.
Scalar alpha(1);
EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT, static_cast<int>(n_), static_cast<int>(nrhs), &alpha,
static_cast<const Scalar*>(d_qr_.ptr), static_cast<int>(lda_),
static_cast<Scalar*>(d_work.ptr), static_cast<int>(ldb)));
if (m_ == n_) {
// Square: result is the whole buffer, dense.
DeviceMatrix<Scalar> result(static_cast<Scalar*>(d_work.ptr), n_, static_cast<Index>(nrhs));
d_work.ptr = nullptr; // transfer ownership
result.recordReady(stream_);
return result;
} else {
// Overdetermined: copy first n rows of each column into a dense n × nrhs result.
DeviceMatrix<Scalar> result(n_, static_cast<Index>(nrhs));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(result.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_work.ptr,
static_cast<size_t>(ldb) * sizeof(Scalar),
static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs),
cudaMemcpyDeviceToDevice, stream_));
result.recordReady(stream_);
return result;
// d_work freed here via RAII — safe because stream is ordered.
}
}
// ---- Accessors -----------------------------------------------------------
ComputationInfo info() const {
sync_info();
return info_;
}
Index rows() const { return m_; }
Index cols() const { return n_; }
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusolverDnHandle_t handle_ = nullptr;
cublasHandle_t cublas_ = nullptr;
internal::CusolverParams params_;
internal::DeviceBuffer d_qr_; // QR factors (reflectors in lower, R in upper)
internal::DeviceBuffer d_tau_; // Householder scalars (min(m,n))
internal::DeviceBuffer d_scratch_; // workspace + info word
size_t scratch_size_ = 0;
std::vector<char> h_workspace_;
Index m_ = 0;
Index n_ = 0;
int64_t lda_ = 0;
ComputationInfo info_ = InvalidInput;
int info_word_ = 0;
bool info_synced_ = true;
void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
ensure_scratch(0);
}
void ensure_scratch(size_t workspace_bytes) {
constexpr size_t kAlign = 16;
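// Round up to a multiple of kAlign so the int info word placed at the end of
// the scratch buffer (see scratch_info()) stays aligned,
// e.g. 1 -> 16, 16 -> 16, 17 -> 32.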
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
size_t needed = workspace_bytes + sizeof(int);
if (needed > scratch_size_) {
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_scratch_ = internal::DeviceBuffer(needed);
scratch_size_ = needed;
}
}
void* scratch_workspace() const { return d_scratch_.ptr; }
int* scratch_info() const {
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
}
void sync_info() const {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
const_cast<GpuQR*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
const_cast<GpuQR*>(this)->info_synced_ = true;
}
}
void factorize() {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
info_synced_ = false;
info_ = InvalidInput;
size_t dev_ws = 0, host_ws = 0;
EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf_bufferSize(handle_, params_.p, static_cast<int64_t>(m_),
static_cast<int64_t>(n_), dtype, d_qr_.ptr, lda_, dtype,
d_tau_.ptr, dtype, &dev_ws, &host_ws));
ensure_scratch(dev_ws);
h_workspace_.resize(host_ws);
EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf(handle_, params_.p, static_cast<int64_t>(m_), static_cast<int64_t>(n_), dtype,
d_qr_.ptr, lda_, dtype, d_tau_.ptr, dtype, scratch_workspace(), dev_ws,
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
}
// Apply Q^H to a device buffer in-place: d_B = Q^H * d_B.
// Uses type-specific ormqr (real) or unmqr (complex) wrappers from CuSolverSupport.h.
// For real types: Q^H = Q^T, use CUBLAS_OP_T. For complex: use CUBLAS_OP_C.
void apply_QH(void* d_B, int64_t ldb, int64_t nrhs) const {
const int im = static_cast<int>(m_);
const int in = static_cast<int>(nrhs);
const int ik = static_cast<int>((std::min)(m_, n_));
const int ilda = static_cast<int>(lda_);
const int ildb = static_cast<int>(ldb);
constexpr cublasOperation_t trans = NumTraits<Scalar>::IsComplex ? CUBLAS_OP_C : CUBLAS_OP_T;
int lwork = 0;
EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr_bufferSize(
handle_, CUBLAS_SIDE_LEFT, trans, im, in, ik, static_cast<const Scalar*>(d_qr_.ptr), ilda,
static_cast<const Scalar*>(d_tau_.ptr), static_cast<const Scalar*>(d_B), ildb, &lwork));
internal::DeviceBuffer d_work(static_cast<size_t>(lwork) * sizeof(Scalar));
EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr(handle_, CUBLAS_SIDE_LEFT, trans, im, in, ik,
static_cast<const Scalar*>(d_qr_.ptr), ilda,
static_cast<const Scalar*>(d_tau_.ptr), static_cast<Scalar*>(d_B),
ildb, static_cast<Scalar*>(d_work.ptr), lwork, scratch_info()));
// Sync to ensure workspace can be freed safely, and check ormqr info.
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
int ormqr_info = 0;
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(&ormqr_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost));
eigen_assert(ormqr_info == 0 && "cusolverDnXormqr reported an error");
}
};
} // namespace Eigen
#endif // EIGEN_GPU_QR_H
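A minimal host-side sketch of the least-squares path this file adds (assuming a build with EIGEN_USE_GPU, linked against cuSOLVER and cuBLAS; the sizes are illustrative and the GPU module include name is an assumption, since it is not shown in this hunk):

#include <Eigen/Dense>
#include <Eigen/GPU>  // assumed module header name

int main() {
  // Overdetermined system: 1000 equations, 100 unknowns.
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(1000, 100);
  Eigen::VectorXd b = Eigen::VectorXd::Random(1000);
  Eigen::GpuQR<double> qr(A);          // upload + geqrf on the device
  if (qr.info() == Eigen::Success) {
    Eigen::VectorXd x = qr.solve(b);   // ormqr (Q^H * b), then trsm on R
    // x minimizes ||A*x - b||_2 because m >= n.
  }
  return 0;
}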

Eigen/src/GPU/GpuSVD.h Normal file

@@ -0,0 +1,495 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU SVD decomposition using cuSOLVER's 64-bit Xgesvd API.
//
// Wraps cusolverDnXgesvd. Stores U, S, VT on device. Solve uses
// cuBLAS GEMM: X = VT^H * diag(D) * U^H * B.
//
// cuSOLVER returns VT (not V). We store and expose VT directly.
//
// Usage:
// GpuSVD<double> svd(A, ComputeThinU | ComputeThinV);
// VectorXd S = svd.singularValues();
// MatrixXd U = svd.matrixU(); // m×k or m×m
// MatrixXd V = svd.matrixV(); // n×k or n×n (matches JacobiSVD)
// MatrixXd VT = svd.matrixVT(); // k×n or n×n (this is V^T)
// MatrixXd X = svd.solve(B); // pseudoinverse
// MatrixXd X = svd.solve(B, k); // truncated (top k triplets)
// MatrixXd X = svd.solve(B, 0.1); // Tikhonov regularized
#ifndef EIGEN_GPU_SVD_H
#define EIGEN_GPU_SVD_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSolverSupport.h"
#include "./CuBlasSupport.h"
#include <vector>
namespace Eigen {
template <typename Scalar_>
class GpuSVD {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
using RealVector = Matrix<RealScalar, Dynamic, 1>;
GpuSVD() { init_context(); }
template <typename InputType>
explicit GpuSVD(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
init_context();
compute(A, options);
}
~GpuSVD() {
if (handle_) (void)cusolverDnDestroy(handle_);
if (cublas_lt_) (void)cublasLtDestroy(cublas_lt_);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuSVD(const GpuSVD&) = delete;
GpuSVD& operator=(const GpuSVD&) = delete;
// Move constructors omitted for brevity — follow GpuQR pattern.
// ---- Factorization -------------------------------------------------------
template <typename InputType>
GpuSVD& compute(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
options_ = options;
m_ = A.rows();
n_ = A.cols();
info_ = InvalidInput;
info_synced_ = false;
if (m_ == 0 || n_ == 0) {
info_ = Success;
info_synced_ = true;
return *this;
}
// cuSOLVER gesvd requires m >= n. For wide matrices, transpose internally.
transposed_ = (m_ < n_);
const PlainMatrix mat = transposed_ ? PlainMatrix(A.derived().adjoint()) : PlainMatrix(A.derived());
if (transposed_) std::swap(m_, n_);
lda_ = static_cast<int64_t>(mat.rows());
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
// Copy (possibly transposed) A to device (gesvd overwrites it).
d_A_ = internal::DeviceBuffer(mat_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
factorize();
return *this;
}
GpuSVD& compute(const DeviceMatrix<Scalar>& d_A, unsigned int options = ComputeThinU | ComputeThinV) {
options_ = options;
m_ = d_A.rows();
n_ = d_A.cols();
info_ = InvalidInput;
info_synced_ = false;
if (m_ == 0 || n_ == 0) {
info_ = Success;
info_synced_ = true;
return *this;
}
transposed_ = (m_ < n_);
d_A.waitReady(stream_);
if (transposed_) {
// Transpose on device via cuBLAS geam: d_A_ = A^H.
std::swap(m_, n_);
lda_ = m_;
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
d_A_ = internal::DeviceBuffer(mat_bytes);
Scalar alpha_one(1), beta_zero(0);
// geam: C(m×n) = alpha * op(A) + beta * op(B). With beta = 0, B is never read, so pass nullptr.
// A is the original d_A (n_orig × m_orig = n × m after swap), transposed → m × n.
EIGEN_CUBLAS_CHECK(internal::cublasXgeam(
cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(m_), static_cast<int>(n_), &alpha_one, d_A.data(),
static_cast<int>(d_A.rows()), &beta_zero, static_cast<const Scalar*>(nullptr), static_cast<int>(m_),
static_cast<Scalar*>(d_A_.ptr), static_cast<int>(m_)));
} else {
lda_ = static_cast<int64_t>(d_A.rows());
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
d_A_ = internal::DeviceBuffer(mat_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
}
factorize();
return *this;
}
// ---- Accessors -----------------------------------------------------------
ComputationInfo info() const {
sync_info();
return info_;
}
Index rows() const { return transposed_ ? n_ : m_; }
Index cols() const { return transposed_ ? m_ : n_; }
// TODO: Add device-side accessors (deviceU(), deviceVT(), deviceSingularValues())
// returning DeviceMatrix views of the internal buffers, so users can chain
// GPU operations without round-tripping through host memory.
/** Singular values (always available). Downloads from device on each call. */
RealVector singularValues() const {
sync_info();
eigen_assert(info_ == Success);
const Index k = (std::min)(m_, n_);
RealVector S(k);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpy(S.data(), d_S_.ptr, static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
return S;
}
/** Left singular vectors U. Returns m_orig × k or m_orig × m_orig.
* For transposed case (m_orig < n_orig), U comes from cuSOLVER's VT. */
PlainMatrix matrixU() const {
sync_info();
eigen_assert(info_ == Success);
eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "matrixU() requires ComputeThinU or ComputeFullU");
const Index m_orig = transposed_ ? n_ : m_;
const Index n_orig = transposed_ ? m_ : n_;
const Index k = (std::min)(m_orig, n_orig);
if (!transposed_) {
const Index ucols = (options_ & ComputeFullU) ? m_ : k;
PlainMatrix U(m_, ucols);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U.data(), d_U_.ptr,
static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
cudaMemcpyDeviceToHost));
return U;
} else {
// Transposed: U_orig = VT_stored^H. VT_stored is vtrows × n_ (= vtrows × m_orig).
const Index vtrows = (options_ & ComputeFullU) ? m_orig : k; // Note: FullU maps to FullV of A^H
PlainMatrix VT_stored(vtrows, n_);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT_stored.data(), d_VT_.ptr,
static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
cudaMemcpyDeviceToHost));
return VT_stored.adjoint(); // m_orig × vtrows
}
}
/** Right singular vectors V. Returns n_orig × k or n_orig × n_orig.
* Equivalent to matrixVT().adjoint(). Matches Eigen's JacobiSVD::matrixV() API. */
PlainMatrix matrixV() const { return matrixVT().adjoint(); }
/** Right singular vectors transposed V^T. Returns k × n_orig or n_orig × n_orig.
* For transposed case, VT comes from cuSOLVER's U. */
PlainMatrix matrixVT() const {
sync_info();
eigen_assert(info_ == Success);
eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "matrixVT() requires ComputeThinV or ComputeFullV");
const Index m_orig = transposed_ ? n_ : m_;
const Index n_orig = transposed_ ? m_ : n_;
const Index k = (std::min)(m_orig, n_orig);
if (!transposed_) {
const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
PlainMatrix VT(vtrows, n_);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT.data(), d_VT_.ptr,
static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
cudaMemcpyDeviceToHost));
return VT;
} else {
// Transposed: VT_orig = U_stored^H. U_stored is m_ × ucols (= n_orig × ucols).
const Index ucols = (options_ & ComputeFullV) ? n_orig : k; // FullV maps to FullU of A^H
PlainMatrix U_stored(m_, ucols);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U_stored.data(), d_U_.ptr,
static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
cudaMemcpyDeviceToHost));
return U_stored.adjoint(); // ucols × n_orig
}
}
/** Number of singular values above threshold. */
Index rank(RealScalar threshold = RealScalar(-1)) const {
RealVector S = singularValues();
if (S.size() == 0) return 0;
if (threshold < 0) {
threshold = (std::max)(m_, n_) * S(0) * NumTraits<RealScalar>::epsilon();
}
Index r = 0;
for (Index i = 0; i < S.size(); ++i) {
if (S(i) > threshold) ++r;
}
return r;
}
// ---- Solve ---------------------------------------------------------------
/** Pseudoinverse solve: X = V * diag(1/S) * U^H * B. */
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
return solve_impl(B, (std::min)(m_, n_), RealScalar(0));
}
/** Truncated solve: use only top trunc singular triplets. */
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B, Index trunc) const {
eigen_assert(trunc > 0 && trunc <= (std::min)(m_, n_));
return solve_impl(B, trunc, RealScalar(0));
}
/** Tikhonov-regularized solve: D_ii = S_i / (S_i^2 + lambda^2). */
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B, RealScalar lambda) const {
eigen_assert(lambda > 0);
return solve_impl(B, (std::min)(m_, n_), lambda);
}
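// Note: the truncated and Tikhonov overloads differ only in the type of the
// second argument. With Eigen's default Index (std::ptrdiff_t), a bare int
// literal such as solve(B, 5) is ambiguous between them; pass Index(5) or a
// floating literal like 0.5 to select the intended overload.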
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusolverDnHandle_t handle_ = nullptr;
cublasHandle_t cublas_ = nullptr;
cublasLtHandle_t cublas_lt_ = nullptr;
mutable internal::DeviceBuffer gemm_workspace_;
internal::CusolverParams params_;
internal::DeviceBuffer d_A_; // working copy of A (overwritten by gesvd)
internal::DeviceBuffer d_U_; // left singular vectors
internal::DeviceBuffer d_S_; // singular values (RealScalar)
internal::DeviceBuffer d_VT_; // right singular vectors transposed
internal::DeviceBuffer d_scratch_; // workspace + info
size_t scratch_size_ = 0;
std::vector<char> h_workspace_;
unsigned int options_ = 0;
Index m_ = 0;
Index n_ = 0;
int64_t lda_ = 0;
bool transposed_ = false; // true if m < n (we compute SVD of A^T internally)
ComputationInfo info_ = InvalidInput;
int info_word_ = 0;
bool info_synced_ = true;
void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
EIGEN_CUBLASLT_CHECK(cublasLtCreate(&cublas_lt_));
ensure_scratch(0);
}
void ensure_scratch(size_t workspace_bytes) {
constexpr size_t kAlign = 16;
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
size_t needed = workspace_bytes + sizeof(int);
if (needed > scratch_size_) {
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_scratch_ = internal::DeviceBuffer(needed);
scratch_size_ = needed;
}
}
void* scratch_workspace() const { return d_scratch_.ptr; }
int* scratch_info() const {
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
}
void sync_info() const {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
const_cast<GpuSVD*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
const_cast<GpuSVD*>(this)->info_synced_ = true;
}
}
// Swap U↔V flags for the transposed case.
static unsigned int swap_uv_options(unsigned int opts) {
unsigned int result = 0;
if (opts & ComputeThinU) result |= ComputeThinV;
if (opts & ComputeFullU) result |= ComputeFullV;
if (opts & ComputeThinV) result |= ComputeThinU;
if (opts & ComputeFullV) result |= ComputeFullU;
return result;
}
static signed char jobu(unsigned int opts) {
if (opts & ComputeFullU) return 'A';
if (opts & ComputeThinU) return 'S';
return 'N';
}
static signed char jobvt(unsigned int opts) {
if (opts & ComputeFullV) return 'A';
if (opts & ComputeThinV) return 'S';
return 'N';
}
void factorize() {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
const Index k = (std::min)(m_, n_);
info_synced_ = false;
info_ = InvalidInput;
// Allocate output buffers. When transposed, swap U/V roles for cuSOLVER.
d_S_ = internal::DeviceBuffer(static_cast<size_t>(k) * sizeof(RealScalar));
// Internal options: for transposed case, what user wants as U we compute as VT of A^H.
const unsigned int int_opts = transposed_ ? swap_uv_options(options_) : options_;
const Index ucols = (int_opts & ComputeFullU) ? m_ : ((int_opts & ComputeThinU) ? k : 0);
const Index vtrows = (int_opts & ComputeFullV) ? n_ : ((int_opts & ComputeThinV) ? k : 0);
const int64_t ldu = m_;
const int64_t ldvt = vtrows > 0 ? vtrows : 1;
if (ucols > 0) d_U_ = internal::DeviceBuffer(static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar));
if (vtrows > 0)
d_VT_ = internal::DeviceBuffer(static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar));
// computeType must match the matrix data type (dtype), not the singular value type (rtype).
eigen_assert(m_ >= n_ && "Internal error: m_ < n_ should have been handled by transpose in compute()");
size_t dev_ws = 0, host_ws = 0;
EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd_bufferSize(
handle_, params_.p, jobu(int_opts), jobvt(int_opts), static_cast<int64_t>(m_), static_cast<int64_t>(n_), dtype,
d_A_.ptr, lda_, rtype, d_S_.ptr, dtype, ucols > 0 ? d_U_.ptr : nullptr, ldu, dtype,
vtrows > 0 ? d_VT_.ptr : nullptr, ldvt, dtype, &dev_ws, &host_ws));
ensure_scratch(dev_ws);
h_workspace_.resize(host_ws);
// Compute SVD.
EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd(handle_, params_.p, jobu(int_opts), jobvt(int_opts), static_cast<int64_t>(m_),
static_cast<int64_t>(n_), dtype, d_A_.ptr, lda_, rtype, d_S_.ptr, dtype,
ucols > 0 ? d_U_.ptr : nullptr, ldu, dtype, vtrows > 0 ? d_VT_.ptr : nullptr,
ldvt, dtype, scratch_workspace(), dev_ws,
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
}
// Internal solve: X = V * diag(D) * U^H * B, using top `trunc` triplets.
// D_ii = 1/S_i (if lambda==0) or S_i/(S_i^2+lambda^2).
//
// For non-transposed: stored U, VT. X = VT^H * D * U^H * B.
// For transposed (SVD of A^H): stored U', VT'. X = U' * D * VT' * B.
template <typename Rhs>
PlainMatrix solve_impl(const MatrixBase<Rhs>& B, Index trunc, RealScalar lambda) const {
sync_info();
eigen_assert(info_ == Success && "GpuSVD::solve called on a failed or uninitialized decomposition");
eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "solve requires U");
eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "solve requires V");
const Index m_orig = transposed_ ? n_ : m_;
const Index n_orig = transposed_ ? m_ : n_;
eigen_assert(B.rows() == m_orig);
const Index k = (std::min)(m_, n_); // = min(m_orig, n_orig)
const Index kk = (std::min)(trunc, k);
const Index nrhs = B.cols();
// Download S to host to build the diagonal scaling.
RealVector S(k);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpy(S.data(), d_S_.ptr, static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
// Upload B (m_orig × nrhs).
const PlainMatrix rhs(B);
internal::DeviceBuffer d_B(static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.ptr, rhs.data(),
static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
cudaMemcpyHostToDevice, stream_));
// Step 1: tmp = U_orig^H * B (kk × nrhs).
// Non-transposed: U_stored is m_×ucols, U_orig = U_stored. Use U_stored^H * B.
// Transposed: U_orig = VT_stored^H, so U_orig^H = VT_stored. Use VT_stored * B (no transpose!).
internal::DeviceBuffer d_tmp(static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar));
{
Scalar alpha_one(1), beta_zero(0);
if (!transposed_) {
// U_stored^H * B: (m_×kk)^H × (m_×nrhs) → kk×nrhs.
internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_C, CUBLAS_OP_N, kk, nrhs, m_, &alpha_one,
static_cast<const Scalar*>(d_U_.ptr), m_, static_cast<const Scalar*>(d_B.ptr),
m_orig, &beta_zero, static_cast<Scalar*>(d_tmp.ptr), kk, &gemm_workspace_,
stream_);
} else {
// VT_stored * B: VT_stored is vtrows×n_ = kk×m_orig (thin), NoTrans.
// vtrows×m_orig times m_orig×nrhs → vtrows×nrhs. Use first kk rows.
const Index vtrows_stored = (swap_uv_options(options_) & ComputeFullV) ? n_ : k;
internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_N, CUBLAS_OP_N, kk, nrhs, m_orig, &alpha_one,
static_cast<const Scalar*>(d_VT_.ptr), vtrows_stored,
static_cast<const Scalar*>(d_B.ptr), m_orig, &beta_zero,
static_cast<Scalar*>(d_tmp.ptr), kk, &gemm_workspace_, stream_);
}
}
// Step 2: Scale row i of tmp by D_ii.
// Download tmp to host, scale, re-upload. (Simple and correct; a device kernel would be faster.)
{
PlainMatrix tmp(kk, nrhs);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(tmp.data(), d_tmp.ptr,
static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar),
cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
for (Index i = 0; i < kk; ++i) {
RealScalar si = S(i);
RealScalar di = (lambda == RealScalar(0)) ? (si > 0 ? RealScalar(1) / si : RealScalar(0))
: si / (si * si + lambda * lambda);
tmp.row(i) *= Scalar(di);
}
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_tmp.ptr, tmp.data(),
static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar),
cudaMemcpyHostToDevice, stream_));
}
// Step 3: X = V_orig * tmp (n_orig × nrhs).
// Non-transposed: V_orig = VT_stored^H. VT_stored[:kk,:]^H * tmp → n_orig × nrhs.
// Transposed: V_orig = U_stored[:,:kk]. U_stored * tmp → n_orig × nrhs (NoTrans).
PlainMatrix X(n_orig, nrhs);
{
internal::DeviceBuffer d_X(static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
Scalar alpha_one(1), beta_zero(0);
if (!transposed_) {
const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_C, CUBLAS_OP_N, n_orig, nrhs, kk, &alpha_one,
static_cast<const Scalar*>(d_VT_.ptr), vtrows,
static_cast<const Scalar*>(d_tmp.ptr), kk, &beta_zero,
static_cast<Scalar*>(d_X.ptr), n_orig, &gemm_workspace_, stream_);
} else {
// U_stored is m_×ucols. V_orig = U_stored[:,:kk]. NoTrans × tmp.
internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_N, CUBLAS_OP_N, n_orig, nrhs, kk, &alpha_one,
static_cast<const Scalar*>(d_U_.ptr), m_, static_cast<const Scalar*>(d_tmp.ptr),
kk, &beta_zero, static_cast<Scalar*>(d_X.ptr), n_orig, &gemm_workspace_,
stream_);
}
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_X.ptr,
static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
}
return X;
}
};
} // namespace Eigen
#endif // EIGEN_GPU_SVD_H
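A sketch of the three solve modes, under the same build assumptions as the GpuQR example above (sizes illustrative):

Eigen::MatrixXd A = Eigen::MatrixXd::Random(500, 200);
Eigen::MatrixXd B = Eigen::MatrixXd::Random(500, 3);
Eigen::GpuSVD<double> svd(A);                         // ComputeThinU | ComputeThinV by default
Eigen::MatrixXd X0 = svd.solve(B);                    // pseudoinverse: D_ii = 1/S_i
Eigen::MatrixXd X1 = svd.solve(B, Eigen::Index(50));  // truncated: top 50 triplets
Eigen::MatrixXd X2 = svd.solve(B, 0.1);               // Tikhonov: D_ii = S_i/(S_i^2 + 0.01)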

Eigen/src/GPU/GpuSparseContext.h Normal file

@@ -0,0 +1,481 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix
// multiply (SpMM) via cuSPARSE.
//
// GpuSparseContext manages cuSPARSE descriptors and device buffers. It accepts
// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the GPU.
// RowMajor input is implicitly converted to ColMajor.
//
// Can borrow a GpuContext for same-stream execution with BLAS-1 ops (zero
// event overhead in iterative solvers like CG).
//
// Usage:
// // Standalone (own stream):
// GpuSparseContext<double> ctx;
// VectorXd y = ctx.multiply(A, x);
//
// // Shared context (same stream as BLAS-1 ops):
// GpuContext gpu_ctx;
// GpuSparseContext<double> sparse_ctx(gpu_ctx);
// VectorXd y = sparse_ctx.multiply(A, x);
//
// // Device-resident (no host roundtrip):
// sparse_ctx.multiply(A, d_x, d_y); // DeviceMatrix in/out
#ifndef EIGEN_GPU_SPARSE_CONTEXT_H
#define EIGEN_GPU_SPARSE_CONTEXT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSparseSupport.h"
namespace Eigen {
// Forward declarations.
template <typename Scalar_>
class GpuSparseContext;
template <typename Scalar_>
class DeviceSparseView;
/** SpMV expression: DeviceSparseView * DeviceMatrix → SpMVExpr.
* Evaluated by DeviceMatrix::operator=(SpMVExpr). */
template <typename Scalar_>
class SpMVExpr {
public:
using Scalar = Scalar_;
SpMVExpr(const DeviceSparseView<Scalar>& view, const DeviceMatrix<Scalar>& x) : view_(view), x_(x) {}
const DeviceSparseView<Scalar>& view() const { return view_; }
const DeviceMatrix<Scalar>& x() const { return x_; }
private:
const DeviceSparseView<Scalar>& view_;
const DeviceMatrix<Scalar>& x_;
};
/** Device-resident sparse matrix view. Returned by GpuSparseContext::deviceView().
* Lightweight handle referencing the context's cached device data.
*
* \warning One GpuSparseContext caches one sparse matrix at a time.
* Creating a second deviceView on the same context overwrites the first.
* For multiple simultaneous sparse matrices, use separate GpuSparseContext
* instances (they can share a GpuContext for same-stream execution).
*
* Supports `d_y = d_A * d_x` via SpMVExpr. */
template <typename Scalar_>
class DeviceSparseView {
public:
using Scalar = Scalar_;
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
DeviceSparseView(GpuSparseContext<Scalar>& ctx, const SpMat& A) : ctx_(ctx), A_(A) {}
/** SpMV expression: d_A * d_x. Evaluated by DeviceMatrix::operator=. */
SpMVExpr<Scalar> operator*(const DeviceMatrix<Scalar>& x) const { return SpMVExpr<Scalar>(*this, x); }
Index rows() const { return A_.rows(); }
Index cols() const { return A_.cols(); }
const GpuSparseContext<Scalar>& context() const { return ctx_; }
const SpMat& matrix() const { return A_; }
private:
GpuSparseContext<Scalar>& ctx_;
const SpMat& A_;
};
template <typename Scalar_>
class GpuSparseContext {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using StorageIndex = int;
using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
using DenseVector = Matrix<Scalar, Dynamic, 1>;
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
/** Standalone: creates own stream and cuSPARSE handle. */
GpuSparseContext() : owns_handle_(true) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
owns_stream_ = true;
EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_));
EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_));
}
/** Borrow a GpuContext: shares stream and cuSPARSE handle.
* The GpuContext must outlive this GpuSparseContext. */
explicit GpuSparseContext(GpuContext& ctx)
: stream_(ctx.stream()), handle_(ctx.cusparseHandle()), owns_stream_(false), owns_handle_(false) {}
~GpuSparseContext() {
destroy_descriptors();
if (owns_handle_ && handle_) (void)cusparseDestroy(handle_);
if (owns_stream_ && stream_) (void)cudaStreamDestroy(stream_);
}
GpuSparseContext(const GpuSparseContext&) = delete;
GpuSparseContext& operator=(const GpuSparseContext&) = delete;
// ---- Device sparse view (for expression syntax: d_y = d_A * d_x) ----------
/** Upload a sparse matrix to device and return a lightweight view.
* The sparse data is uploaded immediately and cached in this context.
* The returned view can be used for repeated SpMV without re-uploading.
* If the matrix values change, call deviceView() again to re-upload.
*
* \warning One context caches one matrix. Calling deviceView() again
* overwrites the previous upload. For multiple simultaneous matrices,
* use separate GpuSparseContext instances sharing the same GpuContext.
*
* Supports `d_y = d_A * d_x` expression syntax. */
DeviceSparseView<Scalar> deviceView(const SpMat& A) {
eigen_assert(A.isCompressed());
upload_sparse(A);
return DeviceSparseView<Scalar>(*this, A);
}
// ---- SpMV: y = A * x (host vectors) --------------------------------------
/** Compute y = A * x. Returns y as a new dense vector. */
template <typename InputType, typename Rhs>
DenseVector multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const SpMat mat(A.derived());
DenseVector y(mat.rows());
y.setZero();
multiply_host_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
return y;
}
/** Compute y = alpha * op(A) * x + beta * y (in-place, host vectors). */
template <typename InputType, typename Rhs, typename Dest>
void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y,
Scalar alpha = Scalar(1), Scalar beta = Scalar(0),
cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
const SpMat mat(A.derived());
multiply_host_impl(mat, x.derived(), y.derived(), alpha, beta, op);
}
// ---- SpMV: y = A * x (DeviceMatrix, no host roundtrip) -------------------
/** Compute d_y = A * d_x. Device-resident, no host transfer.
* Sparse matrix A is uploaded to device (cached). Dense vectors stay on device. */
template <typename InputType>
void multiply(const SparseMatrixBase<InputType>& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y) {
const SpMat mat(A.derived());
multiply_device_impl(mat, d_x, d_y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
}
/** Compute d_y = alpha * op(A) * d_x + beta * d_y (DeviceMatrix, in-place). */
template <typename InputType>
void multiply(const SparseMatrixBase<InputType>& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y,
Scalar alpha, Scalar beta, cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
const SpMat mat(A.derived());
multiply_device_impl(mat, d_x, d_y, alpha, beta, op);
}
// ---- SpMV transpose -------------------------------------------------------
/** Compute y = A^T * x (host vectors). */
template <typename InputType, typename Rhs>
DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const SpMat mat(A.derived());
DenseVector y(mat.cols());
y.setZero();
multiply_host_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE);
return y;
}
// ---- SpMM: Y = A * X (host, multiple RHS) --------------------------------
/** Compute Y = A * X where X is a dense matrix. Returns Y. */
template <typename InputType, typename Rhs>
DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X) {
const SpMat mat(A.derived());
const DenseMatrix rhs(X.derived());
eigen_assert(mat.cols() == rhs.rows());
const Index m = mat.rows();
const Index n = rhs.cols();
if (m == 0 || n == 0 || mat.nonZeros() == 0) return DenseMatrix::Zero(m, n);
DenseMatrix Y = DenseMatrix::Zero(m, n);
spmm_impl(mat, rhs, Y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
return Y;
}
// ---- Accessors ------------------------------------------------------------
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusparseHandle_t handle_ = nullptr;
bool owns_stream_ = false;
bool owns_handle_ = false;
// Cached device buffers for sparse matrix (grow-only).
internal::DeviceBuffer d_outerPtr_;
internal::DeviceBuffer d_innerIdx_;
internal::DeviceBuffer d_values_;
size_t d_outerPtr_size_ = 0;
size_t d_innerIdx_size_ = 0;
size_t d_values_size_ = 0;
// Cached device buffers for host-API dense vectors (grow-only).
internal::DeviceBuffer d_x_;
internal::DeviceBuffer d_y_;
size_t d_x_size_ = 0;
size_t d_y_size_ = 0;
mutable internal::DeviceBuffer d_workspace_;
mutable size_t d_workspace_size_ = 0;
// Cached cuSPARSE sparse matrix descriptor.
cusparseSpMatDescr_t spmat_desc_ = nullptr;
Index cached_rows_ = -1;
Index cached_cols_ = -1;
Index cached_nnz_ = -1;
// ---- SpMV with host vectors (upload/download per call) --------------------
template <typename RhsDerived, typename DestDerived>
void multiply_host_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta,
cusparseOperation_t op) {
eigen_assert(A.isCompressed());
const Index m = A.rows();
const Index n = A.cols();
const Index nnz = A.nonZeros();
const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
eigen_assert(x.size() == x_size);
eigen_assert(y.size() == y_size);
if (m == 0 || n == 0 || nnz == 0) {
if (beta == Scalar(0))
y.setZero();
else
y *= beta;
return;
}
upload_sparse(A);
ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar));
const DenseVector x_tmp(x);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_.ptr, x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar));
if (beta != Scalar(0)) {
const DenseVector y_tmp(y);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_y_.ptr, y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
}
exec_spmv(x_size, y_size, d_x_.ptr, d_y_.ptr, alpha, beta, op);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(y.data(), d_y_.ptr, y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
}
// ---- SpMV with DeviceMatrix (no host transfer) ----------------------------
// Called by public multiply(A, d_x, d_y) — always re-uploads A.
void multiply_device_impl(const SpMat& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y, Scalar alpha,
Scalar beta, cusparseOperation_t op) {
upload_sparse(A);
spmv_device_exec(d_x, d_y, alpha, beta, op);
}
public:
/** Execute SpMV using the already-uploaded sparse matrix (no re-upload).
* Used by SpMVExpr (d_y = d_A * d_x) for cached deviceView() paths.
* The sparse matrix must have been uploaded via deviceView() or multiply(). */
void spmv_device_exec(const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y, Scalar alpha = Scalar(1),
Scalar beta = Scalar(0), cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) const {
eigen_assert(spmat_desc_ && "sparse matrix not uploaded — call deviceView() or multiply() first");
// cuSPARSE SpMV: y must not alias x (undefined behavior).
eigen_assert(d_x.data() != d_y.data() && "SpMV: output aliases input vector");
const Index m = cached_rows_;
const Index n = cached_cols_;
const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
eigen_assert(d_x.rows() * d_x.cols() == x_size);
if (m == 0 || n == 0 || cached_nnz_ == 0) {
d_y.resize(y_size, 1);
if (beta == Scalar(0)) {
d_y.setZero();
}
return;
}
// Ensure d_y is allocated.
if (d_y.rows() * d_y.cols() != y_size) {
d_y.resize(y_size, 1);
}
// Wait for input data to be ready on this stream.
d_x.waitReady(stream_);
d_y.waitReady(stream_);
exec_spmv(x_size, y_size, const_cast<void*>(static_cast<const void*>(d_x.data())), static_cast<void*>(d_y.data()),
alpha, beta, op);
d_y.recordReady(stream_);
}
private:
// ---- Shared SpMV execution ------------------------------------------------
void exec_spmv(Index x_size, Index y_size, void* d_x_ptr, void* d_y_ptr, Scalar alpha, Scalar beta,
cusparseOperation_t op) const {
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr;
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_ptr, dtype));
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_ptr, dtype));
size_t ws_size = 0;
EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
CUSPARSE_SPMV_ALG_DEFAULT, &ws_size));
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.ptr));
(void)cusparseDestroyDnVec(x_desc);
(void)cusparseDestroyDnVec(y_desc);
}
// ---- SpMM implementation --------------------------------------------------
void spmm_impl(const SpMat& A, const DenseMatrix& X, DenseMatrix& Y, Scalar alpha, Scalar beta,
cusparseOperation_t op) {
eigen_assert(A.isCompressed());
const Index m = A.rows();
const Index n = X.cols();
const Index k = A.cols();
const Index nnz = A.nonZeros();
if (m == 0 || n == 0 || k == 0 || nnz == 0) {
if (beta == Scalar(0))
Y.setZero();
else
Y *= beta;
return;
}
upload_sparse(A);
const size_t x_bytes = static_cast<size_t>(k) * static_cast<size_t>(n) * sizeof(Scalar);
const size_t y_bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(Scalar);
ensure_buffer(d_x_, d_x_size_, x_bytes);
ensure_buffer(d_y_, d_y_size_, y_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_x_.ptr, X.data(), x_bytes, cudaMemcpyHostToDevice, stream_));
if (beta != Scalar(0)) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.ptr, Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_));
}
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr;
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k, n, k, d_x_.ptr, dtype, CUSPARSE_ORDER_COL));
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m, n, m, d_y_.ptr, dtype, CUSPARSE_ORDER_COL));
size_t ws_size = 0;
EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_,
x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size));
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta,
y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.ptr));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.ptr, y_bytes, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
(void)cusparseDestroyDnMat(x_desc);
(void)cusparseDestroyDnMat(y_desc);
}
// ---- Helpers --------------------------------------------------------------
void upload_sparse(const SpMat& A) {
const Index m = A.rows();
const Index n = A.cols();
const Index nnz = A.nonZeros();
const size_t outer_bytes = static_cast<size_t>(n + 1) * sizeof(StorageIndex);
const size_t inner_bytes = static_cast<size_t>(nnz) * sizeof(StorageIndex);
const size_t val_bytes = static_cast<size_t>(nnz) * sizeof(Scalar);
ensure_buffer(d_outerPtr_, d_outerPtr_size_, outer_bytes);
ensure_buffer(d_innerIdx_, d_innerIdx_size_, inner_bytes);
ensure_buffer(d_values_, d_values_size_, val_bytes);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_outerPtr_.ptr, A.outerIndexPtr(), outer_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_innerIdx_.ptr, A.innerIndexPtr(), inner_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, A.valuePtr(), val_bytes, cudaMemcpyHostToDevice, stream_));
if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) {
destroy_descriptors();
constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value;
EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr,
idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO, val_type));
cached_rows_ = m;
cached_cols_ = n;
cached_nnz_ = nnz;
} else {
EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr));
}
}
void destroy_descriptors() {
if (spmat_desc_) {
(void)cusparseDestroySpMat(spmat_desc_);
spmat_desc_ = nullptr;
}
cached_rows_ = -1;
cached_cols_ = -1;
cached_nnz_ = -1;
}
void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) const {
if (needed > current_size) {
if (buf.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
buf = internal::DeviceBuffer(needed);
current_size = needed;
}
}
};
// ---- DeviceMatrix::operator=(SpMVExpr) out-of-line definition ----------------
// Defined here because it needs the full GpuSparseContext definition.
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const SpMVExpr<Scalar_>& expr) {
// Use spmv_device_exec — the sparse matrix was already uploaded by deviceView().
// No re-upload on repeated SpMV with the same view.
expr.view().context().spmv_device_exec(expr.x(), *this, Scalar_(1), Scalar_(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
return *this;
}
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_CONTEXT_H
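A sketch of the cached, device-resident SpMV path in an iterative-solver-style loop (assuming a compressed SparseMatrix<double> A; the host-to-device upload API for dense vectors is outside this hunk and elided):

Eigen::GpuContext gpu_ctx;
Eigen::GpuSparseContext<double> sparse_ctx(gpu_ctx);  // shares gpu_ctx's stream
auto d_A = sparse_ctx.deviceView(A);                  // one upload, cached in the context
Eigen::DeviceMatrix<double> d_x(A.cols(), 1);
Eigen::DeviceMatrix<double> d_y(A.rows(), 1);
// ... initialize d_x on the device ...
for (int it = 0; it < max_iters; ++it) {
  d_y = d_A * d_x;  // SpMV via SpMVExpr: no re-upload, no host roundtrip
  // ... BLAS-1 updates on the same stream ...
}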

Eigen/src/GPU/GpuSparseLDLT.h Normal file

@@ -0,0 +1,62 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse LDL^T / LDL^H factorization via cuDSS.
//
// For symmetric indefinite (or Hermitian indefinite) sparse matrices.
// Same three-phase workflow as GpuSparseLLT.
//
// Usage:
// GpuSparseLDLT<double> ldlt(A); // analyze + factorize
// VectorXd x = ldlt.solve(b); // solve
#ifndef EIGEN_GPU_SPARSE_LDLT_H
#define EIGEN_GPU_SPARSE_LDLT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSparseSolverBase.h"
namespace Eigen {
/** GPU sparse LDL^T factorization (symmetric indefinite / Hermitian indefinite).
*
* Wraps cuDSS with CUDSS_MTYPE_SYMMETRIC (real) or CUDSS_MTYPE_HERMITIAN (complex).
* Uses pivoting for numerical stability.
*
* \tparam Scalar_ float, double, complex<float>, or complex<double>
* \tparam UpLo_ Lower (default) or Upper — which triangle of A is stored
*/
template <typename Scalar_, int UpLo_ = Lower>
class GpuSparseLDLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT<Scalar_, UpLo_>> {
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT>;
friend Base;
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };
GpuSparseLDLT() = default;
template <typename InputType>
explicit GpuSparseLDLT(const SparseMatrixBase<InputType>& A) {
this->compute(A);
}
static constexpr bool needs_csr_conversion() { return false; }
static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_symmetric_type<Scalar>::value; }
static constexpr cudssMatrixViewType_t cudss_matrix_view() {
return internal::cudss_view_type<UpLo, ColMajor>::value;
}
};
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_LDLT_H
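A minimal sketch for a symmetric indefinite system (e.g. a saddle-point/KKT matrix), assuming the lower triangle of A is stored in a compressed ColMajor SparseMatrix and build_kkt_matrix() is a hypothetical helper:

Eigen::SparseMatrix<double> A = build_kkt_matrix();  // hypothetical builder
Eigen::VectorXd b = Eigen::VectorXd::Random(A.rows());
Eigen::GpuSparseLDLT<double> ldlt(A);                // CUDSS_MTYPE_SYMMETRIC, pivoted
if (ldlt.info() == Eigen::Success) {
  Eigen::VectorXd x = ldlt.solve(b);
}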

Eigen/src/GPU/GpuSparseLLT.h Normal file

@@ -0,0 +1,62 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse Cholesky (LL^T / LL^H) via cuDSS.
//
// Usage:
// GpuSparseLLT<double> llt(A); // analyze + factorize
// VectorXd x = llt.solve(b); // solve
// llt.analyzePattern(A); // or separate phases
// llt.factorize(A_new); // reuse symbolic analysis
#ifndef EIGEN_GPU_SPARSE_LLT_H
#define EIGEN_GPU_SPARSE_LLT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSparseSolverBase.h"
namespace Eigen {
/** GPU sparse Cholesky factorization (LL^T for real, LL^H for complex).
*
* Wraps cuDSS with CUDSS_MTYPE_SPD (real) or CUDSS_MTYPE_HPD (complex).
* Accepts ColMajor SparseMatrix (CSC), reinterpreted as CSR with swapped
* triangle view for zero-copy upload.
*
* \tparam Scalar_ float, double, complex<float>, or complex<double>
* \tparam UpLo_ Lower (default) or Upper — which triangle of A is stored
*/
template <typename Scalar_, int UpLo_ = Lower>
class GpuSparseLLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT<Scalar_, UpLo_>> {
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT>;
friend Base;
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };
GpuSparseLLT() = default;
template <typename InputType>
explicit GpuSparseLLT(const SparseMatrixBase<InputType>& A) {
this->compute(A);
}
static constexpr bool needs_csr_conversion() { return false; }
static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_spd_type<Scalar>::value; }
static constexpr cudssMatrixViewType_t cudss_matrix_view() {
return internal::cudss_view_type<UpLo, ColMajor>::value;
}
};
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_LLT_H
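The split phases pay off when one sparsity pattern is refactorized many times, e.g. a time-stepping loop where only the values of A change. A minimal sketch, assuming update_values() is a hypothetical helper that preserves the pattern (as factorize() requires):

Eigen::GpuSparseLLT<double> llt;
llt.analyzePattern(A);         // symbolic analysis, once
for (int step = 0; step < nsteps; ++step) {
  update_values(A);            // hypothetical: values change, pattern does not
  llt.factorize(A);            // reuses the symbolic analysis
  if (llt.info() != Eigen::Success) break;
  Eigen::VectorXd x = llt.solve(b);
  // ...
}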

Eigen/src/GPU/GpuSparseLU.h Normal file

@@ -0,0 +1,59 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse LU factorization via cuDSS.
//
// For general (non-symmetric) sparse matrices. Uses pivoting.
// Same three-phase workflow as GpuSparseLLT.
//
// Usage:
// GpuSparseLU<double> lu(A); // analyze + factorize
// VectorXd x = lu.solve(b); // solve
#ifndef EIGEN_GPU_SPARSE_LU_H
#define EIGEN_GPU_SPARSE_LU_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSparseSolverBase.h"
namespace Eigen {
/** GPU sparse LU factorization (general matrices).
*
* Wraps cuDSS with CUDSS_MTYPE_GENERAL and CUDSS_MVIEW_FULL.
* Accepts ColMajor SparseMatrix (CSC); internally converts to RowMajor
* CSR since cuDSS requires CSR input.
*
* \tparam Scalar_ float, double, complex<float>, or complex<double>
*/
template <typename Scalar_>
class GpuSparseLU : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLU<Scalar_>> {
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLU>;
friend Base;
public:
using Scalar = Scalar_;
GpuSparseLU() = default;
template <typename InputType>
explicit GpuSparseLU(const SparseMatrixBase<InputType>& A) {
this->compute(A);
}
static constexpr bool needs_csr_conversion() { return true; }
static constexpr cudssMatrixType_t cudss_matrix_type() { return CUDSS_MTYPE_GENERAL; }
static constexpr cudssMatrixViewType_t cudss_matrix_view() { return CUDSS_MVIEW_FULL; }
};
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_LU_H
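The corresponding sketch for a general (non-symmetric) square system; the full matrix is passed and the CSC-to-CSR conversion happens internally:

Eigen::GpuSparseLU<double> lu(A);   // CUDSS_MTYPE_GENERAL, CUDSS_MVIEW_FULL
if (lu.info() == Eigen::Success) {
  Eigen::MatrixXd X = lu.solve(B);  // B may carry multiple right-hand sides
}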

Eigen/src/GPU/GpuSparseSolverBase.h Normal file

@@ -0,0 +1,356 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Common base for GPU sparse direct solvers (LLT, LDLT, LU) via cuDSS.
//
// All three solver types share the same three-phase workflow
// (analyzePattern → factorize → solve) and differ only in the
// cudssMatrixType_t and cudssMatrixViewType_t passed to cuDSS.
// This CRTP base implements the entire workflow; derived classes
// provide the matrix type/view via static constexpr members.
#ifndef EIGEN_GPU_SPARSE_SOLVER_BASE_H
#define EIGEN_GPU_SPARSE_SOLVER_BASE_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuDssSupport.h"
namespace Eigen {
namespace internal {
/** CRTP base for GPU sparse direct solvers.
*
* \tparam Scalar_ Element type (passed explicitly to avoid incomplete-type issues with CRTP).
* \tparam Derived The concrete solver class (GpuSparseLLT, GpuSparseLDLT, GpuSparseLU).
* Must provide:
* - `static constexpr cudssMatrixType_t cudss_matrix_type()`
* - `static constexpr cudssMatrixViewType_t cudss_matrix_view()`
*/
template <typename Scalar_, typename Derived>
class GpuSparseSolverBase {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using StorageIndex = int;
using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
using CsrMat = SparseMatrix<Scalar, RowMajor, StorageIndex>;
using DenseVector = Matrix<Scalar, Dynamic, 1>;
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
GpuSparseSolverBase() { init_context(); }
~GpuSparseSolverBase() {
destroy_cudss_objects();
if (handle_) (void)cudssDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuSparseSolverBase(const GpuSparseSolverBase&) = delete;
GpuSparseSolverBase& operator=(const GpuSparseSolverBase&) = delete;
// ---- Configuration --------------------------------------------------------
/** Set the fill-reducing ordering algorithm. Must be called before compute/analyzePattern. */
void setOrdering(GpuSparseOrdering ordering) { ordering_ = ordering; }
// ---- Factorization --------------------------------------------------------
/** Symbolic analysis + numeric factorization. */
template <typename InputType>
Derived& compute(const SparseMatrixBase<InputType>& A) {
analyzePattern(A);
if (info_ == Success) {
factorize(A);
}
return derived();
}
/** Symbolic analysis only. Uploads sparsity structure to device.
* This phase is synchronous (blocks until complete). */
template <typename InputType>
Derived& analyzePattern(const SparseMatrixBase<InputType>& A) {
const SpMat csc(A.derived());
eigen_assert(csc.rows() == csc.cols() && "GpuSparseSolver requires a square matrix");
eigen_assert(csc.isCompressed() && "GpuSparseSolver requires a compressed sparse matrix");
n_ = csc.rows();
info_ = InvalidInput;
analysis_done_ = false;
if (n_ == 0) {
nnz_ = 0;
info_ = Success;
analysis_done_ = true;
return derived();
}
// For symmetric solvers, ColMajor CSC can be reinterpreted as CSR with
// swapped triangle view (zero copy). For general solvers, we must convert
// to actual RowMajor CSR so cuDSS sees the correct matrix, not A^T.
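// Example: the CSC arrays of a stored lower triangle read verbatim as the
// CSR arrays of the upper triangle of A^T; for the symmetric/Hermitian
// solvers that is just the other triangle of the same matrix, so
// cudss_matrix_view() reports the swapped triangle to cuDSS.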
if (Derived::needs_csr_conversion()) {
const CsrMat csr(csc);
nnz_ = csr.nonZeros();
upload_csr(csr);
} else {
nnz_ = csc.nonZeros();
upload_csr_from_csc(csc);
}
create_cudss_matrix();
apply_ordering_config();
if (data_) EIGEN_CUDSS_CHECK(cudssDataDestroy(handle_, data_));
EIGEN_CUDSS_CHECK(cudssDataCreate(handle_, &data_));
create_placeholder_dense();
EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_ANALYSIS, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
analysis_done_ = true;
info_ = Success;
return derived();
}
/** Numeric factorization using the symbolic analysis from analyzePattern.
*
* \warning The sparsity pattern (outerIndexPtr, innerIndexPtr) must be
* identical to the one passed to analyzePattern(). Only the numerical
* values may change. Passing a different pattern is undefined behavior.
* This matches the contract of CHOLMOD, UMFPACK, and cuDSS's own API.
*
* This phase is asynchronous — info() lazily synchronizes. */
template <typename InputType>
Derived& factorize(const SparseMatrixBase<InputType>& A) {
eigen_assert(analysis_done_ && "factorize() requires analyzePattern() first");
if (n_ == 0) {
info_ = Success;
return derived();
}
// Convert to the same format used in analyzePattern.
// Both temporaries must outlive the async memcpy (pageable H2D is actually
// synchronous w.r.t. the host, but keep them alive for clarity).
const SpMat csc(A.derived());
eigen_assert(csc.rows() == n_ && csc.cols() == n_);
const Scalar* value_ptr;
Index value_nnz;
CsrMat csr_tmp;
if (Derived::needs_csr_conversion()) {
csr_tmp = CsrMat(csc);
value_ptr = csr_tmp.valuePtr();
value_nnz = csr_tmp.nonZeros();
} else {
value_ptr = csc.valuePtr();
value_nnz = csc.nonZeros();
}
eigen_assert(value_nnz == nnz_);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, value_ptr, static_cast<size_t>(nnz_) * sizeof(Scalar),
cudaMemcpyHostToDevice, stream_));
EIGEN_CUDSS_CHECK(cudssMatrixSetValues(d_A_cudss_, d_values_.ptr));
info_ = InvalidInput;
info_synced_ = false;
EIGEN_CUDSS_CHECK(
cudssExecute(handle_, CUDSS_PHASE_FACTORIZATION, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
return derived();
}
// ---- Solve ----------------------------------------------------------------
/** Solve A * X = B. Returns X as a dense matrix.
* Supports single or multiple right-hand sides. */
template <typename Rhs>
DenseMatrix solve(const MatrixBase<Rhs>& B) const {
sync_info();
eigen_assert(info_ == Success && "GpuSparseSolver::solve requires a successful factorization");
eigen_assert(B.rows() == n_);
const DenseMatrix rhs(B);
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
if (n_ == 0) return DenseMatrix(0, rhs.cols());
const size_t rhs_bytes = static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar);
DeviceBuffer d_b(rhs_bytes);
DeviceBuffer d_x(rhs_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_b.ptr, rhs.data(), rhs_bytes, cudaMemcpyHostToDevice, stream_));
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
cudssMatrix_t b_cudss = nullptr, x_cudss = nullptr;
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&b_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_b.ptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&x_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_x.ptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_SOLVE, config_, data_, d_A_cudss_, x_cudss, b_cudss));
DenseMatrix X(n_, rhs.cols());
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_x.ptr, rhs_bytes, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
(void)cudssMatrixDestroy(b_cudss);
(void)cudssMatrixDestroy(x_cudss);
return X;
}
// ---- Accessors ------------------------------------------------------------
ComputationInfo info() const {
sync_info();
return info_;
}
Index rows() const { return n_; }
Index cols() const { return n_; }
cudaStream_t stream() const { return stream_; }
protected:
// ---- CUDA / cuDSS handles -------------------------------------------------
cudaStream_t stream_ = nullptr;
cudssHandle_t handle_ = nullptr;
cudssConfig_t config_ = nullptr;
cudssData_t data_ = nullptr;
cudssMatrix_t d_A_cudss_ = nullptr;
cudssMatrix_t d_x_cudss_ = nullptr;
cudssMatrix_t d_b_cudss_ = nullptr;
// ---- Device buffers for CSR arrays ----------------------------------------
DeviceBuffer d_rowPtr_;
DeviceBuffer d_colIdx_;
DeviceBuffer d_values_;
// ---- State ----------------------------------------------------------------
Index n_ = 0;
Index nnz_ = 0;
ComputationInfo info_ = InvalidInput;
bool info_synced_ = true;
bool analysis_done_ = false;
GpuSparseOrdering ordering_ = GpuSparseOrdering::AMD;
private:
Derived& derived() { return static_cast<Derived&>(*this); }
const Derived& derived() const { return static_cast<const Derived&>(*this); }
void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUDSS_CHECK(cudssCreate(&handle_));
EIGEN_CUDSS_CHECK(cudssSetStream(handle_, stream_));
EIGEN_CUDSS_CHECK(cudssConfigCreate(&config_));
}
void sync_info() const {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
int cudss_info = 0;
EIGEN_CUDSS_CHECK(cudssDataGet(handle_, data_, CUDSS_DATA_INFO, &cudss_info, sizeof(cudss_info), nullptr));
auto* self = const_cast<GpuSparseSolverBase*>(this);
self->info_ = (cudss_info == 0) ? Success : NumericalIssue;
self->info_synced_ = true;
}
}
void destroy_cudss_objects() {
if (d_A_cudss_) {
(void)cudssMatrixDestroy(d_A_cudss_);
d_A_cudss_ = nullptr;
}
if (d_x_cudss_) {
(void)cudssMatrixDestroy(d_x_cudss_);
d_x_cudss_ = nullptr;
}
if (d_b_cudss_) {
(void)cudssMatrixDestroy(d_b_cudss_);
d_b_cudss_ = nullptr;
}
if (data_) {
(void)cudssDataDestroy(handle_, data_);
data_ = nullptr;
}
if (config_) {
(void)cudssConfigDestroy(config_);
config_ = nullptr;
}
}
// Upload CSR from a RowMajor sparse matrix (native CSR).
void upload_csr(const CsrMat& csr) { upload_compressed(csr.outerIndexPtr(), csr.innerIndexPtr(), csr.valuePtr()); }
// Upload CSC arrays reinterpreted as CSR (for symmetric matrices: CSC(A) = CSR(A^T) = CSR(A)).
void upload_csr_from_csc(const SpMat& csc) {
upload_compressed(csc.outerIndexPtr(), csc.innerIndexPtr(), csc.valuePtr());
}
void upload_compressed(const StorageIndex* outer, const StorageIndex* inner, const Scalar* values) {
const size_t rowptr_bytes = static_cast<size_t>(n_ + 1) * sizeof(StorageIndex);
const size_t colidx_bytes = static_cast<size_t>(nnz_) * sizeof(StorageIndex);
const size_t values_bytes = static_cast<size_t>(nnz_) * sizeof(Scalar);
d_rowPtr_ = DeviceBuffer(rowptr_bytes);
d_colIdx_ = DeviceBuffer(colidx_bytes);
d_values_ = DeviceBuffer(values_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_rowPtr_.ptr, outer, rowptr_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_colIdx_.ptr, inner, colidx_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, values, values_bytes, cudaMemcpyHostToDevice, stream_));
}
void create_cudss_matrix() {
if (d_A_cudss_) (void)cudssMatrixDestroy(d_A_cudss_);
constexpr cudaDataType_t idx_type = cudss_index_type<StorageIndex>::value;
constexpr cudaDataType_t val_type = cuda_data_type<Scalar>::value;
constexpr cudssMatrixType_t mtype = Derived::cudss_matrix_type();
constexpr cudssMatrixViewType_t mview = Derived::cudss_matrix_view();
EIGEN_CUDSS_CHECK(cudssMatrixCreateCsr(
&d_A_cudss_, static_cast<int64_t>(n_), static_cast<int64_t>(n_), static_cast<int64_t>(nnz_), d_rowPtr_.ptr,
/*rowEnd=*/nullptr, d_colIdx_.ptr, d_values_.ptr, idx_type, val_type, mtype, mview, CUDSS_BASE_ZERO));
}
void apply_ordering_config() {
cudssAlgType_t alg;
switch (ordering_) {
case GpuSparseOrdering::AMD:
alg = CUDSS_ALG_DEFAULT;
break;
case GpuSparseOrdering::METIS:
alg = CUDSS_ALG_2;
break;
case GpuSparseOrdering::RCM:
alg = CUDSS_ALG_3;
break;
default:
alg = CUDSS_ALG_DEFAULT;
break;
}
EIGEN_CUDSS_CHECK(cudssConfigSet(config_, CUDSS_CONFIG_REORDERING_ALG, &alg, sizeof(alg)));
}
void create_placeholder_dense() {
if (d_x_cudss_) (void)cudssMatrixDestroy(d_x_cudss_);
if (d_b_cudss_) (void)cudssMatrixDestroy(d_b_cudss_);
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_x_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_b_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
}
};
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_SOLVER_BASE_H

View File

@@ -21,6 +21,7 @@
#include "./InternalHeaderCheck.h" #include "./InternalHeaderCheck.h"
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <vector>
namespace Eigen { namespace Eigen {
namespace internal { namespace internal {
@@ -36,26 +37,99 @@ namespace internal {
// ---- RAII: device buffer ----------------------------------------------------
// Thread-local pool of small device buffers to avoid cudaMalloc/cudaFree
// overhead for tiny allocations (e.g., DeviceScalar). Buffers up to
// kSmallBufferThreshold bytes are recycled; larger allocations bypass the pool.
template <size_t SmallBufferThreshold = 256, size_t MaxPoolSize = 64>
struct DeviceBufferPool {
static constexpr size_t kSmallBufferThreshold = SmallBufferThreshold;
static constexpr size_t kMaxPoolSize = MaxPoolSize;
struct Entry {
void* ptr;
size_t bytes;
};
~DeviceBufferPool() {
for (auto& e : free_list_) (void)cudaFree(e.ptr);
}
void* allocate(size_t bytes) {
// Search for a buffer of sufficient size.
for (size_t i = 0; i < free_list_.size(); ++i) {
if (free_list_[i].bytes >= bytes) {
void* p = free_list_[i].ptr;
free_list_[i] = free_list_.back();
free_list_.pop_back();
return p;
}
}
// No suitable buffer found — allocate new.
void* p = nullptr;
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
return p;
}
void deallocate(void* p, size_t bytes) {
if (free_list_.size() < kMaxPoolSize) {
free_list_.push_back({p, bytes});
} else {
(void)cudaFree(p);
}
}
static DeviceBufferPool& threadLocal() {
thread_local DeviceBufferPool pool;
return pool;
}
private:
std::vector<Entry> free_list_;
};
struct DeviceBuffer {
  void* ptr = nullptr;
  DeviceBuffer() = default;
  explicit DeviceBuffer(size_t bytes) : size_(bytes) {
    if (bytes > 0) {
      if (bytes <= DeviceBufferPool<>::kSmallBufferThreshold) {
        ptr = DeviceBufferPool<>::threadLocal().allocate(bytes);
      } else {
        EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&ptr, bytes));
      }
    }
  }
  ~DeviceBuffer() {
    // Destructor: return small buffers to the pool, free large ones (ignore errors).
    if (ptr) {
      if (size_ <= DeviceBufferPool<>::kSmallBufferThreshold) {
        DeviceBufferPool<>::threadLocal().deallocate(ptr, size_);
      } else {
        (void)cudaFree(ptr);
      }
    }
  }
  // Move-only.
  DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr), size_(o.size_) {
    o.ptr = nullptr;
    o.size_ = 0;
  }
  DeviceBuffer& operator=(DeviceBuffer&& o) noexcept {
    if (this != &o) {
      if (ptr) {
        if (size_ <= DeviceBufferPool<>::kSmallBufferThreshold) {
          DeviceBufferPool<>::threadLocal().deallocate(ptr, size_);
        } else {
          (void)cudaFree(ptr);
        }
      }
      ptr = o.ptr;
      size_ = o.size_;
      o.ptr = nullptr;
      o.size_ = 0;
    }
    return *this;
  }
@@ -63,12 +137,19 @@ struct DeviceBuffer {
  DeviceBuffer(const DeviceBuffer&) = delete;
  DeviceBuffer& operator=(const DeviceBuffer&) = delete;

  size_t size() const { return size_; }

  // Adopt an existing device pointer. Caller relinquishes ownership.
  // Adopted buffers bypass the pool on destruction.
  static DeviceBuffer adopt(void* p) {
    DeviceBuffer b;
    b.ptr = p;
    b.size_ = DeviceBufferPool<>::kSmallBufferThreshold + 1;  // force cudaFree
    return b;
  }

 private:
  size_t size_ = 0;
};

// ---- Scalar → cudaDataType_t ------------------------------------------------

View File

@@ -1,8 +1,8 @@
# Eigen GPU Module (`Eigen/GPU`)

GPU-accelerated linear algebra for Eigen users, dispatching to NVIDIA CUDA
libraries (cuBLAS, cuSOLVER, cuFFT, cuSPARSE, cuDSS). Requires CUDA 11.4+;
cuDSS features require CUDA 12.0+ and a separate cuDSS install. Header-only.
## Why this module
@@ -10,25 +10,31 @@ Eigen is the linear algebra foundation for a large ecosystem of C++ projects
in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP,
Open3D), scientific computing (Ceres, Stan), and beyond. Many of these
projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations
without dropping down to raw CUDA library APIs.

GPU sparse solvers are a particularly acute gap. Sparse factorization is the
bottleneck in SLAM, bundle adjustment, FEM, and nonlinear optimization --
exactly the workloads where GPU acceleration matters most. Downstream projects
like [Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
GPU-accelerated sparse solvers, and third-party projects like
[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically because
Eigen lacks them. The `Eigen/GPU` module provides GPU sparse Cholesky, LDL^T,
and LU factorization via cuDSS, alongside dense solvers (cuSOLVER), matrix
products (cuBLAS), FFT (cuFFT), and sparse matrix-vector products (cuSPARSE).

Existing Eigen users should be able to move performance-critical dense or
sparse linear algebra to the GPU with minimal code changes and without
learning CUDA library APIs directly.
## Design philosophy

**CPU and GPU coexist.** There is no global compile-time switch that replaces
CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers
explicitly -- `GpuLLT<double>` vs `LLT<MatrixXd>`, `GpuSparseLLT<double>` vs
`SimplicialLLT<SparseMatrix<double>>` -- and both coexist in the same binary.
This also lets users keep the factored matrix on device across multiple solves,
something impossible with compile-time replacement.

**Familiar syntax.** GPU operations use the same expression patterns as CPU
Eigen. Here is a side-by-side comparison:
@@ -38,6 +44,7 @@ Eigen. Here is a side-by-side comparison:
#include <Eigen/Dense>                    #define EIGEN_USE_GPU
                                          #include <Eigen/GPU>

// Dense
MatrixXd A = ...;                         auto d_A = DeviceMatrix<double>::fromHost(A);
MatrixXd B = ...;                         auto d_B = DeviceMatrix<double>::fromHost(B);
@@ -45,11 +52,32 @@ MatrixXd C = A * B; DeviceMatrix<double> d_C = d_A * d_B;
MatrixXd X = A.llt().solve(B);            DeviceMatrix<double> d_X = d_A.llt().solve(d_B);
                                          MatrixXd X = d_X.toHost();

// Sparse (using SpMat = SparseMatrix<double>)
SimplicialLLT<SpMat> llt(A);              GpuSparseLLT<double> llt(A);
VectorXd x = llt.solve(b);                VectorXd x = llt.solve(b);
```

The GPU version reads like CPU Eigen with explicit upload/download for dense
operations, and an almost identical API for sparse solvers. Unsupported
expressions are compile errors.
**Standalone module.** `Eigen/GPU` does not modify or depend on Eigen's Core
expression template system (`MatrixBase`, `CwiseBinaryOp`, etc.).
`DeviceMatrix` is not an Eigen expression type and does not inherit from
`MatrixBase`. The expression layer is a thin compile-time dispatch where every
supported expression maps to a single NVIDIA library call. There is no
coefficient-level evaluation, lazy fusion, or packet operations.
**Interoperability where useful.** `DeviceMatrix` provides the same operator
signatures as `Matrix` for common vector operations: `+=`, `-=`, `*=`,
`dot()`, `squaredNorm()`, `norm()`, `setZero()`, and `noalias()`. This makes
`DeviceMatrix` usable as a drop-in `VectorType` in Eigen algorithm templates
that rely on these operations. For example, Eigen's `conjugate_gradient()`
template works with `DeviceMatrix` with a single typedef change -- no
modifications to the algorithm or the expression template system. Conjugate
gradient is just the motivating example; we are open to expanding operator
coverage as needed to support other high-level Eigen algorithms on the GPU.
**Explicit over implicit.** Host-device transfers, stream management, and
library handle lifetimes are visible in the API. There are no hidden

@@ -85,6 +113,27 @@ MatrixXd C = transfer.get();

`selfadjointView<UpLo>()`, `llt()`, `lu()`. These return lightweight
expression objects that are evaluated when assigned.
For BLAS Level-1 operations, `DeviceMatrix` also provides `dot()`, `norm()`,
`squaredNorm()`, `setZero()`, `noalias()`, and arithmetic operators
(`+=`, `-=`, `*=`) that dispatch to cuBLAS `axpy`, `nrm2`, `dot`, and
`geam`. These are the operations needed by iterative solvers.
### `DeviceScalar<Scalar>`
A device-resident scalar value. Reductions like `dot()`, `norm()`, and
`squaredNorm()` return `DeviceScalar` instead of a host scalar, deferring
the host synchronization until the value is actually needed:
```cpp
auto dot_val = d_x.dot(d_y); // DeviceScalar -- no sync
auto norm_sq = d_r.squaredNorm(); // DeviceScalar -- no sync
Scalar alpha = dot_val / norm_sq; // sync here (implicit conversion)
d_x += alpha * d_p; // host scalar * DeviceMatrix (axpy)
```
Division between `DeviceScalar` values (real types only) is performed on
device via NPP, avoiding extra synchronizations.
### `GpuContext`

Every GPU operation needs a CUDA stream and library handles (cuBLAS,

@@ -107,6 +156,12 @@ d_C1.device(ctx1) = d_A1 * d_B1; // runs on stream 1

d_C2.device(ctx2) = d_A2 * d_B2;   // runs on stream 2 (concurrently)
```
To integrate with existing CUDA code, borrow an existing stream:
```cpp
GpuContext ctx(my_existing_stream); // wraps stream, does not take ownership
```
## Usage

### Matrix operations (cuBLAS)

@@ -122,7 +177,7 @@ d_C = d_A * d_B.transpose();

// Scaled and accumulated
d_C += 2.0 * d_A * d_B;         // alpha=2, beta=1
d_C.device(ctx) -= d_A * d_B;   // alpha=-1, beta=1 (GEMM requires explicit context for -=)

// Triangular solve (TRSM)
d_X = d_A.triangularView<Lower>().solve(d_B);
@@ -134,6 +189,30 @@ d_C = d_A.selfadjointView<Lower>() * d_B;
d_C.selfadjointView<Lower>().rankUpdate(d_A);   // C += A * A^H
```
### BLAS Level-1 operations
```cpp
// Dot product and norms (return DeviceScalar -- no sync until read)
auto dot_val = d_x.dot(d_y); // cublasDdot / cublasCdotc
auto norm_val = d_r.norm(); // cublasDnrm2
double n = norm_val; // implicit conversion triggers sync
// Vector arithmetic (cuBLAS axpy / geam)
d_x += alpha * d_p; // axpy: x = x + alpha * p
d_x -= alpha * d_p; // axpy: x = x - alpha * p
d_x *= alpha; // scal: x = alpha * x
d_r.setZero(); // cudaMemsetAsync
// DeviceScalar arithmetic (stays on device, real types only)
auto alpha = absNew / dot_val; // device-side division via NPP
d_x += alpha * d_p; // DeviceScalar * DeviceMatrix (axpy with device pointer)
// Matrix add/subtract (cuBLAS geam)
DeviceMatrix<double> d_C = d_A + d_B; // C = A + B
d_C = d_A + 2.0 * d_B; // C = A + 2*B
d_C = d_A - d_B; // C = A - B
```
### Dense solvers (cuSOLVER)

**One-shot expression syntax** -- Convenient, re-factorizes each time:

@@ -160,10 +239,149 @@ MatrixXd X2 = d_X2.toHost();

GpuLU<double> lu;
lu.compute(d_A);
auto d_Y = lu.solve(d_B, GpuLU<double>::Transpose);   // A^T Y = B
// QR solve (overdetermined least squares)
GpuQR<double> qr;
qr.compute(d_A); // factorize on device (async)
auto d_X = qr.solve(d_B); // Q^H * B via ormqr, then trsm on R
MatrixXd X = d_X.toHost();
// SVD (results downloaded on access)
GpuSVD<double> svd;
svd.compute(d_A, ComputeThinU | ComputeThinV);
VectorXd S = svd.singularValues(); // downloads to host
MatrixXd U = svd.matrixU(); // downloads to host
MatrixXd V = svd.matrixV(); // V (matches JacobiSVD)
MatrixXd VT = svd.matrixVT(); // V^T (matches cuSOLVER)
// Self-adjoint eigenvalue decomposition (results downloaded on access)
GpuSelfAdjointEigenSolver<double> es;
es.compute(d_A);
VectorXd eigenvals = es.eigenvalues(); // downloads to host
MatrixXd eigenvecs = es.eigenvectors(); // downloads to host
```

The cached API keeps the factored matrix on device, avoiding redundant
host-device transfers and re-factorizations. All solvers also accept host
matrices directly as a convenience (e.g., `GpuLLT<double> llt(A)` or
`qr.solve(B)`), which handles upload/download internally.
### Sparse direct solvers (cuDSS)
Requires cuDSS (separate install, CUDA 12.0+). Define `EIGEN_CUDSS` before
including `Eigen/GPU` and link with `-lcudss`.
```cpp
SparseMatrix<double> A = ...; // symmetric positive definite
VectorXd b = ...;
// Sparse Cholesky -- one-liner
GpuSparseLLT<double> llt(A);
VectorXd x = llt.solve(b);
// Three-phase workflow for repeated solves with the same sparsity pattern
GpuSparseLLT<double> llt;
llt.analyzePattern(A); // symbolic analysis (once)
llt.factorize(A); // numeric factorization
VectorXd x = llt.solve(b);
llt.factorize(A_new_values); // refactorize (reuses symbolic analysis)
VectorXd x2 = llt.solve(b);
// Sparse LDL^T (symmetric indefinite)
GpuSparseLDLT<double> ldlt(A);
VectorXd x = ldlt.solve(b);
// Sparse LU (general non-symmetric)
GpuSparseLU<double> lu(A);
VectorXd x = lu.solve(b);
```
### FFT (cuFFT)
```cpp
GpuFFT<float> fft;
// 1D complex-to-complex
VectorXcf X = fft.fwd(x); // forward
VectorXcf y = fft.inv(X); // inverse (scaled by 1/n)
// 1D real-to-complex / complex-to-real
VectorXcf R = fft.fwd(r); // returns n/2+1 complex (half-spectrum)
VectorXf s = fft.invReal(R, n); // C2R inverse, caller specifies n
// 2D complex-to-complex
MatrixXcf B = fft.fwd2d(A); // 2D forward
MatrixXcf C = fft.inv2d(B); // 2D inverse (scaled by 1/(rows*cols))
// Plans are cached and reused across calls with the same size/type.
```
### Sparse matrix-vector multiply (cuSPARSE)
```cpp
SparseMatrix<double> A = ...;
VectorXd x = ...;
// Host vectors (upload/download handled internally)
GpuSparseContext<double> spmv;
VectorXd y = spmv.multiply(A, x); // y = A * x
VectorXd z = spmv.multiplyT(A, x); // z = A^T * x
spmv.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y
MatrixXd Y = spmv.multiplyMat(A, X); // Y = A * X (SpMM)
// Device-resident SpMV (sparse matrix cached on device)
GpuSparseContext<double> spmv(ctx); // share GpuContext for same-stream
auto d_A = spmv.deviceView(A); // upload sparse matrix once
d_y = d_A * d_x; // operator syntax, stays on device
```
### Eigen algorithm interop (example: Conjugate gradient)
The BLAS-1 operators and `DeviceSparseView` make `DeviceMatrix` usable as a
vector type in GPU implementations of algorithms like conjugate gradient.
Conjugate gradient is the motivating example -- a GPU CG implementation
uses the same operations as the CPU version:
```cpp
GpuContext ctx;
GpuSparseContext<double> spmv(ctx);
auto d_A = spmv.deviceView(A); // sparse matrix on device
auto d_b = DeviceMatrix<double>::fromHost(b);
auto d_x = DeviceMatrix<double>::fromHost(x0);
// CG iteration using DeviceMatrix operators
DeviceMatrix<double> d_r = d_b; // r = b (deep copy via geam)
DeviceMatrix<double> d_p(n), d_tmp(n);
d_tmp = d_A * d_x; // SpMV (device-resident)
d_r -= d_tmp; // axpy
d_p = d_r.clone();
RealScalar absNew = d_r.squaredNorm(); // DeviceScalar -> implicit sync
for (int i = 0; i < maxIters && absNew > tol * tol; ++i) {
d_tmp = d_A * d_p; // SpMV
auto alpha = absNew / d_p.dot(d_tmp); // host / DeviceScalar -> DeviceScalar
d_x += alpha * d_p; // axpy with DeviceScalar
d_r -= alpha * d_tmp; // axpy with DeviceScalar
RealScalar absOld = absNew;
absNew = d_r.squaredNorm(); // DeviceScalar -> implicit sync
d_p *= Scalar(absNew / absOld); // scal (host scalars)
d_p += d_r; // axpy
}
MatrixXd x = d_x.toHost();
```
### Precision control
GEMM dispatch uses `cublasLtMatmul` with heuristic algorithm selection,
enabling cuBLAS to choose tensor core algorithms when beneficial. For double
precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64 results
computed faster via tensor cores.
| Macro | Effect |
|---|---|
| *(default)* | Tensor core algorithms enabled. Float uses full FP32. Double may use Ozaki on sm_80+. |
| `EIGEN_CUDA_TF32` | Opt-in: Float uses TF32 (~2x faster, 10-bit mantissa). Double unaffected. |
| `EIGEN_NO_CUDA_TENSOR_OPS` | Opt-out: Pedantic compute types, no tensor cores. For bit-exact reproducibility. |
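A minimal sketch of opting in (assuming the macro, like `EIGEN_CUDSS`, must be
visible before `Eigen/GPU` is included, or passed on the compiler command line):

```cpp
#define EIGEN_CUDA_TF32   // opt-in: TF32 tensor-core GEMM for float
#define EIGEN_USE_GPU
#include <Eigen/GPU>
```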
### Stream control and async execution

@@ -180,6 +398,7 @@ Mandatory sync points:

- `fromHost()` -- Synchronizes to complete the upload before returning
- `toHost()` / `HostTransfer::get()` -- Must deliver data to host
- `info()` -- Must read the factorization status
- `DeviceScalar` implicit conversion -- Downloads scalar from device

**Cross-stream safety** is automatic. `DeviceMatrix` tracks write completion
via CUDA events. When a matrix written on stream A is read on stream B, the
@@ -190,51 +409,121 @@ skip the wait (CUDA guarantees in-order execution within a stream).
### Supported scalar types

`float`, `double`, `std::complex<float>`, `std::complex<double>` (unless
noted otherwise).

### Expression -> library call mapping

| DeviceMatrix expression | Library call | Parameters |
|---|---|---|
| `C = A * B` | `cublasLtMatmul` | transA=N, transB=N, alpha=1, beta=0 |
| `C = A.adjoint() * B` | `cublasLtMatmul` | transA=C, transB=N |
| `C = A.transpose() * B` | `cublasLtMatmul` | transA=T, transB=N |
| `C = A * B.adjoint()` | `cublasLtMatmul` | transA=N, transB=C |
| `C = A * B.transpose()` | `cublasLtMatmul` | transA=N, transB=T |
| `C = alpha * A * B` | `cublasLtMatmul` | alpha from LHS |
| `C = A * (alpha * B)` | `cublasLtMatmul` | alpha from RHS |
| `C += A * B` | `cublasLtMatmul` | alpha=1, beta=1 |
| `C.device(ctx) -= A * B` | `cublasLtMatmul` | alpha=-1, beta=1 |
| `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs |
| `X = A.llt<Upper>().solve(B)` | same | uplo=Upper |
| `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs |
| `X = A.triangularView<L>().solve(B)` | `cublasXtrsm` | side=L, uplo, diag=NonUnit |
| `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
| `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
| `C = A + B` | `cublasXgeam` | alpha=1, beta=1 |
| `C = A + alpha * B` | `cublasXgeam` | alpha=1, beta from scaled |
| `C = A - B` | `cublasXgeam` | alpha=1, beta=-1 |
| `C = A - alpha * B` | `cublasXgeam` | alpha=1, beta=-scaled |
| `x += alpha * y` | `cublasXaxpy` | alpha (host scalar) |
| `x += dAlpha * y` | `cublasXaxpy` | alpha (DeviceScalar, device pointer mode) |
| `x -= alpha * y` | `cublasXaxpy` | alpha negated |
| `x *= alpha` | `cublasXscal` | alpha (host or DeviceScalar) |
| `x.dot(y)` | `cublasXdot` / `cublasXdotc` | returns `DeviceScalar` |
| `x.norm()` | `cublasXnrm2` | returns `DeviceScalar<RealScalar>` |
| `x.squaredNorm()` | `cublasXdot(x, x)` | returns `DeviceScalar<RealScalar>` |
| `d_y = view * d_x` | `cusparseSpMV` | device-resident SpMV |
### `DeviceMatrix<Scalar>`

Typed RAII wrapper for a dense column-major matrix in GPU device memory.
Always dense (leading dimension = rows). A vector is a `DeviceMatrix` with
one column.

```cpp
// Construction
DeviceMatrix<Scalar>()                    // Empty (0x0)
DeviceMatrix<Scalar>(Index n)             // Allocate column vector (n x 1)
DeviceMatrix<Scalar>(rows, cols)          // Allocate uninitialized

// Upload / download
static DeviceMatrix fromHost(matrix, stream=nullptr)        // -> DeviceMatrix (syncs)
static DeviceMatrix fromHostAsync(ptr, rows, cols, stream)  // -> DeviceMatrix (no sync, caller manages ptr lifetime)
PlainMatrix toHost(stream=nullptr)                          // -> host Matrix (syncs)
HostTransfer toHostAsync(stream=nullptr)                    // -> HostTransfer future (no sync)
DeviceMatrix clone(stream=nullptr)                          // -> DeviceMatrix (D2D copy, async)

// Dimensions and access
Index rows()
Index cols()
size_t sizeInBytes()
bool empty()
Scalar* data()                            // Raw device pointer
void resize(Index rows, Index cols)       // Discard contents, reallocate

// Expression builders (return lightweight views, evaluated on assignment)
AdjointView adjoint()                     // GEMM with ConjTrans
TransposeView transpose()                 // GEMM with Trans
LltExpr llt() / llt<UpLo>()               // -> .solve(d_B) -> DeviceMatrix
LuExpr lu()                               // -> .solve(d_B) -> DeviceMatrix
TriangularView triangularView<UpLo>()     // -> .solve(d_B) -> DeviceMatrix (TRSM)
SelfAdjointView selfadjointView<UpLo>()   // -> * d_B (SYMM), .rankUpdate(d_A) (SYRK)
DeviceAssignment device(GpuContext& ctx)  // Bind assignment to explicit stream
DeviceMatrix& noalias()                   // No-op (all ops are implicitly noalias)

// BLAS Level-1 (all have overloads with explicit GpuContext& parameter)
DeviceScalar<Scalar> dot(const DeviceMatrix& other)   // cuBLAS dot/dotc -> DeviceScalar
DeviceScalar<RealScalar> norm()                       // cuBLAS nrm2 -> DeviceScalar
DeviceScalar<RealScalar> squaredNorm()                // dot(self, self) -> DeviceScalar (no sync)
void setZero()                                        // cudaMemsetAsync
void addScaled(GpuContext&, Scalar alpha, const DeviceMatrix& x)  // this += alpha * x (axpy)
void scale(GpuContext&, Scalar alpha)                 // this *= alpha (scal)
void copyFrom(GpuContext&, const DeviceMatrix& other) // this = other (D2D copy)
DeviceMatrix& operator+=(Scalar * DeviceMatrix)       // cuBLAS axpy
DeviceMatrix& operator-=(Scalar * DeviceMatrix)       // cuBLAS axpy (negated)
DeviceMatrix& operator+=(const DeviceMatrix&)         // cuBLAS axpy
DeviceMatrix& operator-=(const DeviceMatrix&)         // cuBLAS axpy
DeviceMatrix& operator+=(const DeviceScaledDevice&)   // cuBLAS axpy (DeviceScalar * DeviceMatrix)
DeviceMatrix& operator-=(const DeviceScaledDevice&)   // cuBLAS axpy (DeviceScalar * DeviceMatrix, negated)
DeviceMatrix& operator*=(Scalar)                      // cuBLAS scal
DeviceMatrix& operator*=(const DeviceScalar<Scalar>&) // cuBLAS scal (device pointer)
DeviceMatrix cwiseProduct(GpuContext&, const DeviceMatrix&)               // NPP nppsMul (float/double only)
void cwiseProduct(GpuContext&, const DeviceMatrix&, const DeviceMatrix&)  // in-place: this = a .* b

// geam expressions (evaluated on assignment)
DeviceMatrix& operator=(const DeviceAddExpr&)  // C = A + B, C = A + alpha*B, C = A - B, etc.
```
### `DeviceScalar<Scalar>`
Device-resident scalar. Returned by `dot()`, `norm()`, and `squaredNorm()`.
Implicit conversion to `Scalar` triggers `cudaStreamSynchronize` + download.
```cpp
DeviceScalar(cudaStream_t stream = nullptr) // Allocate uninitialized
DeviceScalar(Scalar host_val, cudaStream_t stream) // Upload host value
Scalar get() // Download (syncs stream)
operator Scalar() // Implicit conversion (syncs)
Scalar* devicePtr() // Raw device pointer
cudaStream_t stream()
// Device-side arithmetic (no host sync, real types only)
DeviceScalar operator/(DeviceScalar, DeviceScalar) // NPP nppsDiv
DeviceScalar operator/(Scalar, DeviceScalar) // upload + div
DeviceScalar operator/(DeviceScalar, Scalar) // upload + div
DeviceScalar operator-() // NPP nppsMulC(-1)
```
### `GpuContext`

@@ -242,43 +531,221 @@ Unified GPU execution context owning a CUDA stream and library handles.

```cpp
GpuContext()                                 // Creates dedicated stream + handles
GpuContext(cudaStream_t stream)              // Borrow existing stream (not owned)
static GpuContext& threadLocal()             // Per-thread default (lazy-created)
static void setThreadLocal(GpuContext* ctx)  // Override thread-local default (nullptr restores)
cudaStream_t stream()
cublasHandle_t cublasHandle()
cusolverDnHandle_t cusolverHandle()
cublasLtHandle_t cublasLtHandle()            // Lazy-initialized
cusparseHandle_t cusparseHandle()            // Lazy-initialized
```

Non-copyable, non-movable (owns library handles).

### `GpuLLT<Scalar, UpLo>` -- Dense Cholesky (cuSOLVER)

Caches the Cholesky factor on device for repeated solves.

```cpp
GpuLLT()                                     // Default construct, then call compute()
GpuLLT(const EigenBase<D>& A)                // Convenience: upload + factorize

GpuLLT& compute(const EigenBase<D>& A)       // Upload + factorize
GpuLLT& compute(const DeviceMatrix& d_A)     // D2D copy + factorize
GpuLLT& compute(DeviceMatrix&& d_A)          // Adopt + factorize (no copy)

PlainMatrix solve(const MatrixBase<D>& B)    // -> host Matrix (syncs)
DeviceMatrix solve(const DeviceMatrix& d_B)  // -> DeviceMatrix (async, stays on device)

ComputationInfo info()                       // Lazy sync on first call: Success or NumericalIssue
Index rows() / cols()
cudaStream_t stream()
```

### `GpuLU<Scalar>` -- Dense LU (cuSOLVER)

Same pattern as `GpuLLT`. Adds `TransposeMode` parameter on `solve()`.

```cpp
PlainMatrix solve(const MatrixBase<D>& B, TransposeMode m = NoTranspose)    // -> host Matrix
DeviceMatrix solve(const DeviceMatrix& d_B, TransposeMode m = NoTranspose)  // -> DeviceMatrix
```

`TransposeMode`: `NoTranspose`, `Transpose`, `ConjugateTranspose`.
### `GpuQR<Scalar>` -- Dense QR (cuSOLVER)
QR factorization via `cusolverDnXgeqrf`. Solve uses ORMQR (apply Q^H) + TRSM
(back-substitute on R) -- Q is never formed explicitly.
```cpp
GpuQR() // Default construct
GpuQR(const EigenBase<D>& A) // Convenience: upload + factorize
GpuQR& compute(const EigenBase<D>& A) // Upload + factorize
GpuQR& compute(const DeviceMatrix& d_A) // D2D copy + factorize
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
DeviceMatrix solve(const DeviceMatrix& d_B) // -> DeviceMatrix (async)
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
### `GpuSVD<Scalar>` -- Dense SVD (cuSOLVER)
SVD via `cusolverDnXgesvd`. Supports `ComputeThinU | ComputeThinV`,
`ComputeFullU | ComputeFullV`, or `0` (values only). Wide matrices (m < n)
handled by internal transpose.
```cpp
GpuSVD() // Default construct, then call compute()
GpuSVD(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV) // Convenience
GpuSVD& compute(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV)
GpuSVD& compute(const DeviceMatrix& d_A, unsigned options = ComputeThinU | ComputeThinV)
RealVector singularValues() // -> host vector (syncs, downloads)
PlainMatrix matrixU() // -> host Matrix (syncs, downloads)
PlainMatrix matrixV() // -> host Matrix (V = VT^H, matches JacobiSVD)
PlainMatrix matrixVT() // -> host Matrix (syncs, downloads V^T)
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (pseudoinverse)
PlainMatrix solve(const MatrixBase<D>& B, Index k) // Truncated (top k triplets)
PlainMatrix solve(const MatrixBase<D>& B, RealScalar l) // Tikhonov regularized
Index rank(RealScalar threshold = -1)
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
**Note:** `singularValues()`, `matrixU()`, `matrixV()`, and `matrixVT()`
download to host on each call. Device-side accessors returning `DeviceMatrix`
are planned but not yet implemented.
### `GpuSelfAdjointEigenSolver<Scalar>` -- Eigendecomposition (cuSOLVER)
Symmetric/Hermitian eigenvalue decomposition via `cusolverDnXsyevd`.
`ComputeMode` enum: `EigenvaluesOnly`, `ComputeEigenvectors`.
```cpp
GpuSelfAdjointEigenSolver() // Default construct, then call compute()
GpuSelfAdjointEigenSolver(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors) // Convenience
GpuSelfAdjointEigenSolver& compute(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors)
GpuSelfAdjointEigenSolver& compute(const DeviceMatrix& d_A, ComputeMode mode = ComputeEigenvectors)
RealVector eigenvalues() // -> host vector (syncs, downloads, ascending order)
PlainMatrix eigenvectors() // -> host Matrix (syncs, downloads, columns)
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
**Note:** `eigenvalues()` and `eigenvectors()` download to host on each call.
Device-side accessors returning `DeviceMatrix` are planned but not yet
implemented.
### `HostTransfer<Scalar>`
Future for async device-to-host transfer. Returned by
`DeviceMatrix::toHostAsync()`.
```cpp
PlainMatrix& get() // Block until complete, return host Matrix ref. Idempotent.
bool ready() // Non-blocking poll
```
### `GpuSparseLLT<Scalar, UpLo>` -- Sparse Cholesky (cuDSS)
Requires cuDSS (CUDA 12.0+, `#define EIGEN_CUDSS`). Three-phase workflow
with symbolic reuse. Accepts `SparseMatrix<Scalar, ColMajor, int>` (CSC).
```cpp
GpuSparseLLT() // Default construct
GpuSparseLLT(const SparseMatrixBase<D>& A) // Analyze + factorize
GpuSparseLLT& analyzePattern(const SparseMatrixBase<D>& A) // Symbolic analysis (reusable)
GpuSparseLLT& factorize(const SparseMatrixBase<D>& A) // Numeric factorization
GpuSparseLLT& compute(const SparseMatrixBase<D>& A) // analyzePattern + factorize
void setOrdering(GpuSparseOrdering ord) // AMD (default), METIS, or RCM
DenseMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
### `GpuSparseLDLT<Scalar, UpLo>` -- Sparse LDL^T (cuDSS)
Symmetric indefinite. Same API as `GpuSparseLLT`.
### `GpuSparseLU<Scalar>` -- Sparse LU (cuDSS)
General non-symmetric. Same API as `GpuSparseLLT` (without `UpLo`).
### `GpuFFT<Scalar>` -- FFT (cuFFT)
Plans cached by (size, type) and reused. Inverse transforms scaled so
`inv(fwd(x)) == x`. Supported scalars: `float`, `double`.
```cpp
// 1D transforms (host vectors in and out)
ComplexVector fwd(const MatrixBase<D>& x) // C2C forward (complex input)
ComplexVector fwd(const MatrixBase<D>& x) // R2C forward (real input, returns n/2+1)
ComplexVector inv(const MatrixBase<D>& X) // C2C inverse, scaled by 1/n
RealVector invReal(const MatrixBase<D>& X, Index n) // C2R inverse, scaled by 1/n
// 2D transforms (host matrices in and out)
ComplexMatrix fwd2d(const MatrixBase<D>& A) // 2D C2C forward
ComplexMatrix inv2d(const MatrixBase<D>& A) // 2D C2C inverse, scaled by 1/(rows*cols)
cudaStream_t stream()
```
All FFT methods accept host data and return host data. Upload/download is
handled internally. The C2C and R2C overloads of `fwd()` are distinguished by
the input scalar type (complex vs real).
### `GpuSparseContext<Scalar>` -- SpMV/SpMM (cuSPARSE)
Accepts `SparseMatrix<Scalar, ColMajor>`.
```cpp
GpuSparseContext() // Creates own stream + cuSPARSE handle
GpuSparseContext(GpuContext& ctx) // Borrow GpuContext for same-stream execution
// Host data in/out
DenseVector multiply(A, x) // y = A * x
void multiply(A, x, y, alpha=1, beta=0, // y = alpha*op(A)*x + beta*y
op=CUSPARSE_OPERATION_NON_TRANSPOSE)
DenseVector multiplyT(A, x) // y = A^T * x
DenseMatrix multiplyMat(A, X) // Y = A * X (SpMM)
// DeviceMatrix in/out (sparse matrix re-uploaded each call)
void multiply(A, d_x, d_y) // SpMV with device vectors
void multiply(A, d_x, d_y, alpha, beta, op)
// Device-resident sparse matrix (upload once, reuse)
DeviceSparseView deviceView(A) // Upload sparse matrix, return view
cudaStream_t stream()
```
### `DeviceSparseView<Scalar>` -- Device-resident sparse matrix
Returned by `GpuSparseContext::deviceView()`. Holds a sparse matrix on device
for repeated SpMV without re-uploading.
```cpp
SpMVExpr operator*(const DeviceMatrix& d_x) // d_y = view * d_x (evaluated on assignment)
```
### Aliasing

@@ -286,7 +753,9 @@ Unlike Eigen's `Matrix`, where omitting `.noalias()` triggers a copy to a

temporary, DeviceMatrix dispatches directly to NVIDIA library calls which have
no built-in aliasing protection. All operations are implicitly noalias.
The caller must ensure operands don't alias the destination for GEMM and TRSM
(debug asserts catch violations). `geam` expressions (`d_C = d_A + alpha * d_B`)
are safe with aliasing. The `.noalias()` method exists as a no-op for Eigen
template compatibility.
## File layout

@@ -294,15 +763,29 @@ The caller must ensure operands don't alias the destination for GEMM and TRSM

|------|-----------|----------|
| `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `cuda_data_type<>` |
| `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` |
| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM and geam expression wrappers |
| `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers |
| `DeviceSolverExpr.h` | `DeviceMatrix.h` | Solver expression wrappers (LLT, LU) |
| `DeviceScalar.h` | `GpuSupport.h`, `DeviceScalarOps.h` | `DeviceScalar<>` (device-resident scalar) |
| `DeviceScalarOps.h` | `<npps_*.h>` | Scalar div/neg/cwiseProduct via NPP |
| `DeviceDispatch.h` | all above | All dispatch functions + `DeviceAssignment` |
| `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `GpuContext` |
| `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>`, `<cublasLt.h>` | cuBLAS/cuBLASLt error macro, type maps |
| `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
| `GpuQR.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense QR decomposition |
| `GpuSVD.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense SVD decomposition |
| `GpuEigenSolver.h` | `CuSolverSupport.h` | Self-adjoint eigenvalue decomposition |
| `CuFftSupport.h` | `GpuSupport.h`, `<cufft.h>` | cuFFT error macro, type-dispatch wrappers |
| `GpuFFT.h` | `CuFftSupport.h`, `CuBlasSupport.h` | 1D/2D FFT with plan caching |
| `CuSparseSupport.h` | `GpuSupport.h`, `<cusparse.h>` | cuSPARSE error macro |
| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE, `DeviceSparseView` |
| `CuDssSupport.h` | `GpuSupport.h`, `<cudss.h>` | cuDSS error macro, type traits (optional) |
| `GpuSparseSolverBase.h` | `CuDssSupport.h` | CRTP base for sparse solvers (optional) |
| `GpuSparseLLT.h` | `GpuSparseSolverBase.h` | Sparse Cholesky via cuDSS (optional) |
| `GpuSparseLDLT.h` | `GpuSparseSolverBase.h` | Sparse LDL^T via cuDSS (optional) |
| `GpuSparseLU.h` | `GpuSparseSolverBase.h` | Sparse LU via cuDSS (optional) |
## Building and testing

@@ -313,6 +796,42 @@ cmake -G Ninja -B build -S . \

  -DEIGEN_TEST_CUBLAS=ON \
  -DEIGEN_TEST_CUSOLVER=ON
cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu \
  gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen \
  gpu_device_matrix gpu_cufft gpu_cusparse_spmv gpu_cg
ctest --test-dir build -R "gpu_" --output-on-failure
# Sparse solvers (cuDSS -- separate install required)
cmake -G Ninja -B build -S . \
-DEIGEN_TEST_CUDA=ON \
-DEIGEN_CUDA_COMPUTE_ARCH="70" \
-DEIGEN_TEST_CUDSS=ON
cmake --build build --target gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu
ctest --test-dir build -R gpu_cudss --output-on-failure
```
## Future work
- **Device-side accessors for decomposition results.** `GpuSVD`,
`GpuSelfAdjointEigenSolver`, and `GpuQR` currently download decomposition
results to host on access (e.g., `svd.matrixU()` returns a host `MatrixXd`).
Device-side accessors returning `DeviceMatrix` views of the internal buffers
would allow chaining GPU operations (e.g., `svd.deviceU() * d_A`) without
round-tripping through host memory.
- **Batched API (`DeviceBatchMatrix`).** A strided batch of N identical-size
matrices dispatching to cuBLAS/cuSOLVER batched APIs (`cublasDgemmBatched`,
`cusolverDnXpotrfBatched`, etc.). This enables robotics and model-predictive
control workloads where many small independent systems are solved in
parallel.
- **cuTENSOR for Tensor module.** Replace the hand-written GPU tensor
contraction and reduction kernels (~2300 lines in
`TensorContractionGpu.h` / `TensorReductionGpu.h`) with cuTENSOR dispatch,
following the same library-dispatch pattern used by `Eigen/GPU`.
- **Unified/zero-copy memory for Jetson.** Use `cudaMallocManaged` or
`cudaHostAllocMapped` to eliminate `fromHost()` / `toHost()` copies on
integrated GPUs (Jetson) where CPU and GPU share DRAM.
- **Device-side Eigen interop.** Bridge between host-side `DeviceMatrix`
dispatch and device-side Eigen expression templates (Core + Tensor) running
inside CUDA kernels. Raw-pointer + `Map` / `TensorMap` as the zero-copy
interop surface.

View File

@@ -31,7 +31,10 @@ EIGEN_DONT_INLINE void conjugate_gradient(const MatrixType& mat, const Rhs& rhs,
                                          Index& iters, typename Dest::RealScalar& tol_error) {
  typedef typename Dest::RealScalar RealScalar;
  typedef typename Dest::Scalar Scalar;
  // Use Dest's plain (owning) type as VectorType. For CPU Matrix/Map this
  // resolves to Matrix<Scalar,Dynamic,1>. For GPU DeviceMatrix, PlainObject
  // is DeviceMatrix itself (already owning).
  typedef typename Dest::PlainObject VectorType;

  RealScalar tol = tol_error;
  Index maxIters = iters;

View File

@@ -11,7 +11,7 @@
# ncu --set full -o profile ./build-bench-gpu/bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096

cmake_minimum_required(VERSION 3.18)
project(EigenGpuBenchmarks CXX CUDA)

find_package(benchmark REQUIRED)
find_package(CUDAToolkit REQUIRED)
@@ -51,3 +51,41 @@ eigen_add_gpu_benchmark(bench_gpu_chaining_float bench_gpu_chaining.cpp DEFINITI
# Batching benchmarks: multi-stream concurrency for many small systems.
eigen_add_gpu_benchmark(bench_gpu_batching bench_gpu_batching.cpp)
eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITIONS SCALAR=float)
# FFT benchmarks: 1D/2D C2C, R2C, C2R throughput and plan reuse.
eigen_add_gpu_benchmark(bench_gpu_fft bench_gpu_fft.cpp LIBRARIES CUDA::cufft)
eigen_add_gpu_benchmark(bench_gpu_fft_double bench_gpu_fft.cpp LIBRARIES CUDA::cufft DEFINITIONS SCALAR=double)
# CG sync overhead benchmark: host vs device pointer mode for reductions.
# Uses CUDA kernels for device scalar arithmetic.
add_executable(bench_gpu_cg_sync bench_gpu_cg_sync.cu)
target_include_directories(bench_gpu_cg_sync PRIVATE
${EIGEN_SOURCE_DIR}
${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(bench_gpu_cg_sync PRIVATE
benchmark::benchmark benchmark::benchmark_main
CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
target_compile_options(bench_gpu_cg_sync PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
target_compile_definitions(bench_gpu_cg_sync PRIVATE EIGEN_USE_GPU)
# GPU CG vs CPU CG comparison benchmark.
add_executable(bench_gpu_cg_vs_cpu bench_gpu_cg_vs_cpu.cu)
target_include_directories(bench_gpu_cg_vs_cpu PRIVATE
${EIGEN_SOURCE_DIR}
${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(bench_gpu_cg_vs_cpu PRIVATE
benchmark::benchmark benchmark::benchmark_main
CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
target_compile_options(bench_gpu_cg_vs_cpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
target_compile_definitions(bench_gpu_cg_vs_cpu PRIVATE EIGEN_USE_GPU)
# Bundle Adjustment benchmark: GPU CG vs CPU CG on real BAL datasets.
add_executable(bench_gpu_ba bench_gpu_ba.cu)
target_include_directories(bench_gpu_ba PRIVATE
${EIGEN_SOURCE_DIR}
${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(bench_gpu_ba PRIVATE
benchmark::benchmark
CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
target_compile_options(bench_gpu_ba PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
target_compile_definitions(bench_gpu_ba PRIVATE EIGEN_USE_GPU)

View File

@@ -0,0 +1,149 @@
# Bundle Adjustment: GPU CG vs CPU CG Results
Benchmark of Eigen's GPU CG pipeline on normal equations arising from bundle
adjustment (BAL datasets). Compares CPU `ConjugateGradient` (Jacobi preconditioner)
against GPU CG using `DeviceMatrix` + `GpuSparseContext` + `DeviceScalar`.
## Hardware
- **CPU**: Intel Core i7-13700HX (Raptor Lake, 12 cores / 24 threads, single thread for Eigen CG)
- **GPU**: NVIDIA GeForce RTX 4070 Laptop GPU (Ada Lovelace, 4608 CUDA cores, 8 GB GDDR6)
- **CUDA**: 13.2 / Driver 595.79
- **OS**: Ubuntu 24.04 (WSL2, kernel 6.6.87)
## Software
- Eigen: `eigen-gpu-cg` branch
- Google Benchmark 1.9.1
- Compiler: nvcc 13.2 + g++ 13.3
- Normal equations: H = J^T*J + lambda*I (Levenberg-Marquardt damping, lambda = 1.0)
- CG tolerance: 1e-8, max iterations: 10000
## Method
For each BAL problem file:
1. Parse the BAL file (cameras, 3D points, 2D observations)
2. Compute the full Jacobian J using the BAL camera model (Rodrigues rotation +
perspective projection + radial distortion) with central finite differences
3. Form the normal equations H = J^T*J + lambda*I (sparse, symmetric positive definite)
4. Solve H*dx = -J^T*r using CG with Jacobi preconditioner on CPU and GPU
5. Report wall-clock time (mean of 3 repetitions)
GPU CG uses:
- `GpuSparseContext` for SpMV
- `DeviceMatrix` for vectors
- `DeviceScalar` with `CUBLAS_POINTER_MODE_DEVICE` for dot/norm reductions
- in-place `cwiseProduct` via NPP for the Jacobi preconditioner apply
- device-pointer-mode `scal` to avoid a host sync on the beta update
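For reference, one iteration of the device-resident loop, condensed from
`bench_gpu_ba.cu` later in this change (a sketch of the body inside
`while (i < maxIters)`; setup, counters, and error handling omitted):
```cpp
tmp.noalias() = mat * p;                    // cuSPARSE SpMV, stays on device
auto alpha = absNew / p.dot(ctx, tmp);      // DeviceScalar / DeviceScalar: device kernel, no sync
d_x += alpha * p;                           // device-pointer axpy
residual -= alpha * tmp;
residualNorm2 = residual.squaredNorm(ctx);  // the one host sync per iteration
if (residualNorm2 < threshold) break;
z.cwiseProduct(ctx, d_invdiag, residual);   // Jacobi apply, in place (no allocation)
auto absOld = std::move(absNew);
absNew = residual.dot(ctx, z);
auto beta = absNew / absOld;                // device-side division
p *= beta;                                  // device-pointer scal, no host sync
p += z;
```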
## Results
### Summary table
| Dataset | Cameras | Points | Obs | H size | H nnz | CG iters | CPU CG (ms) | GPU CG (ms) | Speedup |
|---------|---------|--------|-----|--------|-------|----------|-------------|-------------|---------|
| Ladybug-49 | 49 | 7,776 | 31,843 | 23,769 | 1.8M | 4,421 | 4,006 | 1,152 | **3.5x** |
| Ladybug-138 | 138 | 19,878 | 85,217 | 60,876 | 4.8M | 7,008 | 21,498 | 3,553 | **6.1x** |
| Ladybug-646 | 646 | 73,584 | 327,297 | 226,566 | 18.4M | 10,000* | 123,727 | 14,268 | **8.7x** |
| Dubrovnik-356 | 356 | 226,730 | 1,255,268 | 683,394 | 69.8M | 4,308 | 216,149 | 24,493 | **8.8x** |
\* Hit the 10,000-iteration cap (poorly conditioned problem). Both CPU and GPU
hit the same cap, so the timing comparison remains valid.
### Profile breakdown (Ladybug-138, nsys)
GPU kernel time is dominated by SpMV (91%). The remaining 9% is BLAS-1
operations (dot, axpy, scal) and NPP element-wise ops (cwiseProduct).
| Kernel | Time (ms) | % | Calls |
|--------|-----------|---|-------|
| cuSPARSE csrmv (SpMV) | 2507 | 91.3% | 7,006 |
| cuBLAS dot | 92 | 3.4% | 21,020 |
| cuBLAS axpy (device ptr) | 27 | 1.0% | 14,012 |
| cuSPARSE partition | 19 | 0.7% | 7,006 |
| NPP cwiseProduct | 16 + 13 | 1.1% | 14,011 + 7,006 |
| cuBLAS axpy (host ptr) | 12 | 0.5% | 7,005 |
| cuBLAS scal (device ptr) | 11 | 0.4% | 7,005 |
| NPP scalar ops | 7 | 0.2% | 7,006 |
### Optimizations applied
Three profiling-driven optimizations reduced GPU CG time by **1.8x**
(6.5s → 3.6s on Ladybug-138):
1. **In-place `cwiseProduct`**: The Jacobi preconditioner apply
(`z = invdiag .* residual`) was allocating a new DeviceMatrix every
iteration. Added `z.cwiseProduct(ctx, a, b)` that reuses `z`'s buffer.
Reduced `cudaMalloc` calls from 7,053 to 23 (saving 2.3s).
2. **`squaredNorm` via `dot(x,x)`**: cuBLAS `nrm2` uses a numerically
careful scaled-sum-of-squares algorithm (29µs/call). Replaced with
`dot(x,x)` (6.4µs/call) — 4.5x faster per call, saving ~320ms.
3. **Device-pointer `scal`**: `p *= beta` was converting `DeviceScalar`
beta to host (triggering a stream sync), then calling host-pointer-mode
scal. Added `operator*=(DeviceScalar)` that uses device-pointer-mode
scal, eliminating one sync per iteration. Halved `cudaStreamSynchronize`
calls from 14K to 7K.
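In code, the three changes look roughly as follows (a sketch; the "before"
lines are reconstructions of the unoptimized calls, not code from this change):
```cpp
// 1. Jacobi apply: allocating vs in-place.
// before: DeviceMatrix<double> z = d_invdiag.cwiseProduct(ctx, residual);  // cudaMalloc per iter
z.cwiseProduct(ctx, d_invdiag, residual);   // after: reuses z's buffer

// 2. Residual norm: nrm2 vs dot(x, x).
// before: cublas<t>nrm2 (scaled sum of squares), then squaring on the host
residualNorm2 = residual.squaredNorm(ctx);  // after: a single cublas<t>dot call

// 3. Beta update: host round-trip vs device-pointer scal.
// before: double beta_host = beta;  // implicit DeviceScalar -> host read = stream sync
//         p *= beta_host;
p *= beta;                                  // after: device-pointer-mode scal, no sync
```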
### Observations
1. **GPU speedup scales with problem size**: from 3.5x on small problems
(24K variables) to 8.8x on large problems (683K variables). This is
expected — larger problems have more parallelism for the GPU to exploit.
2. **Iteration counts match**: CPU and GPU CG converge in the same number
of iterations (within 1%), confirming numerical equivalence.
3. **Bottleneck is SpMV**: CG iteration time is dominated (91%) by the
sparse matrix-vector product on H. Further speedup requires either
faster SpMV (e.g., block-sparse formats) or algorithmic improvements
(Schur complement, better preconditioners).
4. **Remaining overhead**: CUDA API calls (cudaMemcpyAsync for 8-byte
DeviceScalar transfers) account for ~50% of non-kernel time. Batching
multiple scalar reductions into a single transfer would help.
5. **Jacobi preconditioner is weak for BA**: The Ladybug-646 problem does
not converge in 10K iterations. Ceres uses block Jacobi or Schur
complement preconditioners that would also benefit from GPU acceleration.
### Scaling plot data
```
# n nnz_H cpu_ms gpu_ms speedup
23769 1793475 4006 1152 3.48
60876 4791762 21498 3553 6.05
226566 18387948 123727 14268 8.67
683394 69827066 216149 24493 8.82
```
## BAL datasets
Downloaded from http://grail.cs.washington.edu/projects/bal/
| File | Source |
|------|--------|
| problem-49-7776-pre.txt | Ladybug sequence |
| problem-138-19878-pre.txt | Ladybug sequence |
| problem-646-73584-pre.txt | Ladybug sequence |
| problem-356-226730-pre.txt | Dubrovnik reconstruction |
## Reproducing
```bash
# Build
cmake -G Ninja -B build-bench-gpu -S benchmarks/GPU -DCMAKE_CUDA_ARCHITECTURES=89
cmake --build build-bench-gpu --target bench_gpu_ba
# Download BAL datasets
wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-49-7776-pre.txt.bz2
wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-138-19878-pre.txt.bz2
wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-646-73584-pre.txt.bz2
wget http://grail.cs.washington.edu/projects/bal/data/dubrovnik/problem-356-226730-pre.txt.bz2
bunzip2 *.bz2
# Run (one at a time)
BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
BAL_FILE=problem-138-19878-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
BAL_FILE=problem-646-73584-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
BAL_FILE=problem-356-226730-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
```

View File

@@ -0,0 +1,533 @@
// Bundle Adjustment benchmark: GPU CG vs CPU CG on real BAL datasets.
//
// Tests Eigen's GPU CG pipeline (DeviceMatrix + GpuSparseContext + DeviceScalar)
// on the normal equations (J^T*J) arising from bundle adjustment problems.
//
// Reads a BAL (Bundle Adjustment in the Large) format file, computes the
// Jacobian and residual, forms the normal equations H = J^T*J + lambda*I,
// then solves H*dx = -J^T*r with both CPU and GPU conjugate gradients.
//
// BAL format: http://grail.cs.washington.edu/projects/bal/
//
// Usage:
// cmake --build build-bench-gpu --target bench_gpu_ba
//
// # Download a BAL dataset (bz2-compressed):
// wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-49-7776-pre.txt.bz2
// bunzip2 problem-49-7776-pre.txt.bz2
//
// # Run on a specific problem:
// BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba
//
// # Append results to the log:
// BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba \
// --benchmark_format=console 2>&1 | tee -a benchmarks/GPU/ba_results.log
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/GPU>
#include <cmath>
#include <cstdio>
#include <cstdlib>  // std::getenv, std::exit
#include <fstream>
#include <string>
#include <vector>
using namespace Eigen;
// ============================================================================
// BAL problem data
// ============================================================================
struct BALProblem {
int num_cameras = 0;
int num_points = 0;
int num_observations = 0;
// Observations: (camera_idx, point_idx, observed_x, observed_y).
std::vector<int> camera_index;
std::vector<int> point_index;
std::vector<double> observations_x;
std::vector<double> observations_y;
// Camera parameters: 9 per camera (Rodrigues r[3], translation t[3], f, k1, k2).
std::vector<double> cameras; // [num_cameras * 9]
// 3D points: 3 per point.
std::vector<double> points; // [num_points * 3]
const double* camera(int i) const { return &cameras[i * 9]; }
const double* point(int i) const { return &points[i * 3]; }
bool load(const std::string& filename) {
std::ifstream in(filename);
if (!in) {
fprintf(stderr, "ERROR: Cannot open BAL file: %s\n", filename.c_str());
return false;
}
in >> num_cameras >> num_points >> num_observations;
if (!in || num_cameras <= 0 || num_points <= 0 || num_observations <= 0) {
fprintf(stderr, "ERROR: Invalid BAL header in %s\n", filename.c_str());
return false;
}
camera_index.resize(num_observations);
point_index.resize(num_observations);
observations_x.resize(num_observations);
observations_y.resize(num_observations);
for (int i = 0; i < num_observations; ++i) {
in >> camera_index[i] >> point_index[i] >> observations_x[i] >> observations_y[i];
}
cameras.resize(num_cameras * 9);
for (int i = 0; i < num_cameras * 9; ++i) {
in >> cameras[i];
}
points.resize(num_points * 3);
for (int i = 0; i < num_points * 3; ++i) {
in >> points[i];
}
if (!in) {
fprintf(stderr, "ERROR: Truncated BAL file: %s\n", filename.c_str());
return false;
}
fprintf(stderr, "Loaded BAL: %d cameras, %d points, %d observations\n", num_cameras, num_points, num_observations);
return true;
}
};
// ============================================================================
// Camera projection model (BAL convention)
// ============================================================================
// Rodrigues rotation: rotate point X by axis-angle vector omega.
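// Formula: R(omega) X = X cos(theta) + (omega x X) sin(theta)/theta
//                     + omega (omega . X) (1 - cos(theta)) / theta^2,  theta = |omega|.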
static void rodrigues_rotate(const double* omega, const double* X, double* result) {
double theta2 = omega[0] * omega[0] + omega[1] * omega[1] + omega[2] * omega[2];
if (theta2 > 1e-30) {
double theta = std::sqrt(theta2);
double costh = std::cos(theta);
double sinth = std::sin(theta);
double k = (1.0 - costh) / theta2;
// Cross product omega x X.
double wx = omega[1] * X[2] - omega[2] * X[1];
double wy = omega[2] * X[0] - omega[0] * X[2];
double wz = omega[0] * X[1] - omega[1] * X[0];
// Dot product omega . X.
double dot = omega[0] * X[0] + omega[1] * X[1] + omega[2] * X[2];
result[0] = X[0] * costh + wx * (sinth / theta) + omega[0] * dot * k;
result[1] = X[1] * costh + wy * (sinth / theta) + omega[1] * dot * k;
result[2] = X[2] * costh + wz * (sinth / theta) + omega[2] * dot * k;
} else {
// Small angle: R ≈ I + [omega]×.
result[0] = X[0] + omega[1] * X[2] - omega[2] * X[1];
result[1] = X[1] + omega[2] * X[0] - omega[0] * X[2];
result[2] = X[2] + omega[0] * X[1] - omega[1] * X[0];
}
}
// Project a 3D point through a camera, returning the 2D residual.
// camera: [r0,r1,r2, t0,t1,t2, f, k1, k2]
// point: [X, Y, Z]
// observed: [ox, oy]
// residual: [rx, ry] = projected - observed
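// With P = R(r) X + t: xp = -P.x/P.z, yp = -P.y/P.z, r2 = xp^2 + yp^2,
// predicted = f * (1 + k1*r2 + k2*r2^2) * (xp, yp).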
static void project(const double* camera, const double* point, const double* observed, double* residual) {
// Rotate.
double P[3];
rodrigues_rotate(camera, point, P);
// Translate.
P[0] += camera[3];
P[1] += camera[4];
P[2] += camera[5];
// Normalize (BAL convention: negative z).
double xp = -P[0] / P[2];
double yp = -P[1] / P[2];
// Radial distortion.
double r2 = xp * xp + yp * yp;
double distortion = 1.0 + camera[7] * r2 + camera[8] * r2 * r2;
// Apply focal length.
double predicted_x = camera[6] * distortion * xp;
double predicted_y = camera[6] * distortion * yp;
residual[0] = predicted_x - observed[0];
residual[1] = predicted_y - observed[1];
}
// ============================================================================
// Jacobian computation (numerical differentiation)
// ============================================================================
// Compute the 2x9 Jacobian block w.r.t. camera params and 2x3 block w.r.t.
// point coords for a single observation, using central finite differences.
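// Central difference per parameter: dr/dx_j ≈ (r(x + eps*e_j) - r(x - eps*e_j)) / (2*eps),
// with O(eps^2) truncation error.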
static void compute_jacobian_block(const double* camera, const double* point, const double* observed,
double* J_cam, // 2x9, row-major
double* J_point) // 2x3, row-major
{
constexpr double eps = 1e-8;
// Camera parameters (9).
double cam_pert[9];
std::copy(camera, camera + 9, cam_pert);
for (int j = 0; j < 9; ++j) {
double orig = cam_pert[j];
double rp[2], rm[2];
cam_pert[j] = orig + eps;
project(cam_pert, point, observed, rp);
cam_pert[j] = orig - eps;
project(cam_pert, point, observed, rm);
cam_pert[j] = orig;
J_cam[0 * 9 + j] = (rp[0] - rm[0]) / (2.0 * eps);
J_cam[1 * 9 + j] = (rp[1] - rm[1]) / (2.0 * eps);
}
// Point coordinates (3).
double pt_pert[3];
std::copy(point, point + 3, pt_pert);
for (int j = 0; j < 3; ++j) {
double orig = pt_pert[j];
double rp[2], rm[2];
pt_pert[j] = orig + eps;
project(camera, pt_pert, observed, rp);
pt_pert[j] = orig - eps;
project(camera, pt_pert, observed, rm);
pt_pert[j] = orig;
J_point[0 * 3 + j] = (rp[0] - rm[0]) / (2.0 * eps);
J_point[1 * 3 + j] = (rp[1] - rm[1]) / (2.0 * eps);
}
}
// ============================================================================
// Build normal equations: H = J^T*J + lambda*I, g = -J^T*r
// ============================================================================
struct NormalEquations {
SparseMatrix<double, ColMajor, int> H;
VectorXd g;
VectorXd residual;
double residual_norm;
int jacobian_rows;
int jacobian_cols;
long jacobian_nnz;
};
static NormalEquations build_normal_equations(const BALProblem& problem, double lambda = 1.0) {
const int num_cam_params = problem.num_cameras * 9;
const int num_pt_params = problem.num_points * 3;
const int num_params = num_cam_params + num_pt_params;
const int num_residuals = problem.num_observations * 2;
fprintf(stderr, "Building Jacobian: %d x %d, %ld nonzeros\n", num_residuals, num_params,
(long)problem.num_observations * 24);
// Build J as a triplet list.
using Triplet = Eigen::Triplet<double>;
std::vector<Triplet> triplets;
triplets.reserve(problem.num_observations * 24); // 2 rows × 12 nonzeros = 24 entries per obs
VectorXd residual(num_residuals);
for (int obs = 0; obs < problem.num_observations; ++obs) {
int ci = problem.camera_index[obs];
int pi = problem.point_index[obs];
double observed[2] = {problem.observations_x[obs], problem.observations_y[obs]};
// Compute residual.
double r[2];
project(problem.camera(ci), problem.point(pi), observed, r);
residual[obs * 2 + 0] = r[0];
residual[obs * 2 + 1] = r[1];
// Compute Jacobian blocks.
double J_cam[18], J_pt[6]; // 2x9 and 2x3
compute_jacobian_block(problem.camera(ci), problem.point(pi), observed, J_cam, J_pt);
// Insert camera block: rows [2*obs, 2*obs+1], cols [9*ci, 9*ci+8].
for (int row = 0; row < 2; ++row) {
for (int col = 0; col < 9; ++col) {
double val = J_cam[row * 9 + col];
if (val != 0.0) {
triplets.emplace_back(obs * 2 + row, ci * 9 + col, val);
}
}
}
// Insert point block: rows [2*obs, 2*obs+1], cols [num_cam_params + 3*pi, ...].
for (int row = 0; row < 2; ++row) {
for (int col = 0; col < 3; ++col) {
double val = J_pt[row * 3 + col];
if (val != 0.0) {
triplets.emplace_back(obs * 2 + row, num_cam_params + pi * 3 + col, val);
}
}
}
}
// Build sparse Jacobian.
SparseMatrix<double, ColMajor, int> J(num_residuals, num_params);
J.setFromTriplets(triplets.begin(), triplets.end());
fprintf(stderr, "Jacobian: %dx%d, nnz=%ld\n", (int)J.rows(), (int)J.cols(), (long)J.nonZeros());
// Form normal equations: H = J^T*J + lambda*I.
SparseMatrix<double, ColMajor, int> H = (J.transpose() * J).pruned();
// Add Levenberg-Marquardt damping.
for (int i = 0; i < num_params; ++i) {
H.coeffRef(i, i) += lambda;
}
H.makeCompressed();
// Gradient: g = -J^T * r.
VectorXd g = -(J.transpose() * residual);
double rnorm = residual.norm();
fprintf(stderr, "Normal equations: H is %dx%d, nnz=%ld, |r|=%.6e\n", (int)H.rows(), (int)H.cols(), (long)H.nonZeros(),
rnorm);
return {std::move(H), std::move(g), std::move(residual), rnorm, num_residuals, num_params, (long)J.nonZeros()};
}
// ============================================================================
// Global problem state (loaded once before benchmarks run)
// ============================================================================
static BALProblem g_problem;
static NormalEquations g_neq;
static bool g_loaded = false;
static void ensure_loaded() {
if (g_loaded) return;
const char* bal_file = std::getenv("BAL_FILE");
if (!bal_file) {
fprintf(stderr,
"ERROR: Set BAL_FILE environment variable to a BAL problem file.\n"
" Download from: http://grail.cs.washington.edu/projects/bal/\n"
" Example:\n"
" wget http://grail.cs.washington.edu/projects/bal/data/ladybug/"
"problem-49-7776-pre.txt.bz2\n"
" bunzip2 problem-49-7776-pre.txt.bz2\n"
" BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba\n");
std::exit(1);
}
if (!g_problem.load(bal_file)) {
std::exit(1);
}
g_neq = build_normal_equations(g_problem);
g_loaded = true;
}
// ============================================================================
// CPU CG benchmark
// ============================================================================
static void BM_BA_CPU_CG(benchmark::State& state) {
ensure_loaded();
const auto& H = g_neq.H;
const auto& g = g_neq.g;
ConjugateGradient<SparseMatrix<double, ColMajor, int>, Lower | Upper> cg;
cg.setMaxIterations(10000);
cg.setTolerance(1e-8);
cg.compute(H);
int last_iters = 0;
double last_error = 0;
for (auto _ : state) {
VectorXd dx = cg.solve(g);
benchmark::DoNotOptimize(dx.data());
last_iters = cg.iterations();
last_error = cg.error();
}
state.counters["n"] = H.rows();
state.counters["nnz"] = H.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
state.counters["cameras"] = g_problem.num_cameras;
state.counters["points"] = g_problem.num_points;
state.counters["observations"] = g_problem.num_observations;
}
// ============================================================================
// GPU CG benchmark (with Jacobi preconditioner)
// ============================================================================
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
static void BM_BA_GPU_CG(benchmark::State& state) {
ensure_loaded();
cuda_warmup();
const auto& H = g_neq.H;
const auto& g = g_neq.g;
const Index n = H.rows();
// Extract inverse diagonal (Jacobi preconditioner).
using SpMat = SparseMatrix<double, ColMajor, int>;
VectorXd invdiag(n);
for (Index j = 0; j < H.outerSize(); ++j) {
SpMat::InnerIterator it(H, j);
while (it && it.index() != j) ++it;
if (it && it.index() == j && it.value() != 0.0)
invdiag(j) = 1.0 / it.value();
else
invdiag(j) = 1.0;
}
// Set up GPU context and upload data.
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<double> spmv_ctx(ctx);
auto mat = spmv_ctx.deviceView(H);
auto d_invdiag = DeviceMatrix<double>::fromHost(invdiag, ctx.stream());
auto d_g = DeviceMatrix<double>::fromHost(g, ctx.stream());
int last_iters = 0;
double last_error = 0;
for (auto _ : state) {
DeviceMatrix<double> d_x(n, 1);
d_x.setZero(ctx);
DeviceMatrix<double> residual(n, 1);
residual.copyFrom(ctx, d_g);
double rhsNorm2 = d_g.squaredNorm(ctx);
double threshold = 1e-8 * 1e-8 * rhsNorm2;
double residualNorm2 = residual.squaredNorm(ctx);
DeviceMatrix<double> p = d_invdiag.cwiseProduct(ctx, residual);
DeviceMatrix<double> z(n, 1), tmp(n, 1);
auto absNew = residual.dot(ctx, p);
Index i = 0;
Index maxIters = 10000;
while (i < maxIters) {
tmp.noalias() = mat * p;
auto alpha = absNew / p.dot(ctx, tmp);
d_x += alpha * p;
residual -= alpha * tmp;
residualNorm2 = residual.squaredNorm(ctx);
if (residualNorm2 < threshold) break;
z.cwiseProduct(ctx, d_invdiag, residual); // in-place, no allocation
auto absOld = std::move(absNew);
absNew = residual.dot(ctx, z);
auto beta = absNew / absOld;
p *= beta; // device-pointer scal, no host sync
p += z;
i++;
}
benchmark::DoNotOptimize(d_x.data());
last_iters = i;
last_error = std::sqrt(residualNorm2 / rhsNorm2);
}
GpuContext::setThreadLocal(nullptr);
state.counters["n"] = n;
state.counters["nnz"] = H.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
state.counters["cameras"] = g_problem.num_cameras;
state.counters["points"] = g_problem.num_points;
state.counters["observations"] = g_problem.num_observations;
}
// ============================================================================
// CPU CG with Jacobi preconditioner (apples-to-apples comparison)
// ============================================================================
static void BM_BA_CPU_CG_Jacobi(benchmark::State& state) {
ensure_loaded();
const auto& H = g_neq.H;
const auto& g = g_neq.g;
// Eigen's default third template argument is DiagonalPreconditioner, i.e. Jacobi;
// spell it out so the apples-to-apples intent is explicit.
ConjugateGradient<SparseMatrix<double, ColMajor, int>, Lower | Upper, DiagonalPreconditioner<double>> cg;
cg.setMaxIterations(10000);
cg.setTolerance(1e-8);
cg.compute(H);
int last_iters = 0;
double last_error = 0;
for (auto _ : state) {
VectorXd dx = cg.solve(g);
benchmark::DoNotOptimize(dx.data());
last_iters = cg.iterations();
last_error = cg.error();
}
state.counters["n"] = H.rows();
state.counters["nnz"] = H.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
}
// ============================================================================
// Register benchmarks
// ============================================================================
BENCHMARK(BM_BA_CPU_CG)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_BA_CPU_CG_Jacobi)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_BA_GPU_CG)->Unit(benchmark::kMillisecond);
// ============================================================================
// Custom main: print summary after benchmarks
// ============================================================================
int main(int argc, char** argv) {
benchmark::Initialize(&argc, argv);
// Print problem info before benchmarks.
const char* bal_file = std::getenv("BAL_FILE");
if (bal_file) {
ensure_loaded();
fprintf(stderr,
"\n"
"=== Bundle Adjustment GPU CG Benchmark ===\n"
"BAL file: %s\n"
"Cameras: %d\n"
"Points: %d\n"
"Observations: %d\n"
"J size: %d x %d, nnz=%ld\n"
"H size: %d x %d, nnz=%ld\n"
"|residual|: %.6e\n"
"==========================================\n\n",
bal_file, g_problem.num_cameras, g_problem.num_points, g_problem.num_observations, g_neq.jacobian_rows,
g_neq.jacobian_cols, g_neq.jacobian_nnz, (int)g_neq.H.rows(), (int)g_neq.H.cols(), (long)g_neq.H.nonZeros(),
g_neq.residual_norm);
}
benchmark::RunSpecifiedBenchmarks();
benchmark::Shutdown();
return 0;
}

View File

@@ -0,0 +1,291 @@
// Benchmark: GPU Conjugate Gradient via DeviceMatrix operators.
//
// Shows the path to running Eigen's CG on GPU with minimal code changes.
// The DeviceMatrix benchmark mirrors Eigen's conjugate_gradient() line-by-line.
// A raw cuBLAS device-pointer-mode implementation is included as a lower bound.
//
// The only change needed in Eigen's CG template to support DeviceMatrix:
// Line 34: typedef Dest VectorType; (instead of Matrix<Scalar, Dynamic, 1>)
//
// Usage:
// cmake --build build-bench-gpu --target bench_gpu_cg_sync
// ./build-bench-gpu/bench_gpu_cg_sync
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/GPU>
#include <cusparse.h>
#include <limits>  // std::numeric_limits for considerAsZero
using namespace Eigen;
using Scalar = double;
using RealScalar = double;
using Vec = Matrix<Scalar, Dynamic, 1>;
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
static SpMat make_spd(Index n) {
SpMat A(n, n);
A.reserve(VectorXi::Constant(n, 3));
for (Index i = 0; i < n; ++i) {
A.insert(i, i) = 4.0;
if (i > 0) A.insert(i, i - 1) = -1.0;
if (i < n - 1) A.insert(i, i + 1) = -1.0;
}
A.makeCompressed();
return A;
}
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
// ==========================================================================
// GPU CG using DeviceMatrix operators — mirrors Eigen's conjugate_gradient()
// ==========================================================================
//
// Compare with Eigen/src/IterativeLinearSolvers/ConjugateGradient.h lines 29-84.
// Left column: Eigen CG code. Right column: this benchmark.
//
// Eigen CG GPU CG (this benchmark)
// -------- -----------------------
// VectorType residual = rhs - mat * x; residual.copyFrom(ctx, rhs); [x=0 so r=b]
// RealScalar rhsNorm2 = rhs.sqNorm(); RealScalar rhsNorm2 = rhs.squaredNorm();
// ...
// tmp.noalias() = mat * p; tmp.noalias() = mat * p; [identical]
// Scalar alpha = absNew / p.dot(tmp); Scalar alpha = absNew / p.dot(tmp); [identical]
// x += alpha * p; x += alpha * p; [identical]
// residual -= alpha * tmp; residual -= alpha * tmp; [identical]
// residualNorm2 = residual.sqNorm(); residualNorm2 = residual.squaredNorm(); [identical]
// ...
// p = z + beta * p; p *= beta; p += z; [equivalent, no alloc]
static void BM_CG_DeviceMatrixOps(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
SpMat A = make_spd(n);
Vec b = Vec::Random(n);
// One shared context: SpMV + BLAS-1 on same stream, zero event overhead.
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<Scalar> spmv(ctx);
auto mat = spmv.deviceView(A);
// Upload RHS once.
auto rhs = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
for (auto _ : state) {
// --- Eigen CG lines 34-63: initialization ---
// typedef Dest VectorType; // GPU CHANGE: was Matrix<Scalar,Dynamic,1>
// VectorType residual = rhs - mat * x; // x=0, so residual = rhs
DeviceMatrix<Scalar> x(n, 1);
x.setZero();
DeviceMatrix<Scalar> residual(n, 1);
residual.copyFrom(ctx, rhs);
// RealScalar rhsNorm2 = rhs.squaredNorm();
RealScalar rhsNorm2 = rhs.squaredNorm();
if (rhsNorm2 == 0) continue;
RealScalar tol = 1e-10;
const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
RealScalar threshold = numext::maxi(RealScalar(tol * tol * rhsNorm2), considerAsZero);
// RealScalar residualNorm2 = residual.squaredNorm();
RealScalar residualNorm2 = residual.squaredNorm();
if (residualNorm2 < threshold) continue;
// VectorType p(n);
// p = precond.solve(residual); // no preconditioner: p = residual
DeviceMatrix<Scalar> p(n, 1);
p.copyFrom(ctx, residual);
// VectorType z(n), tmp(n);
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
// auto absNew = numext::real(residual.dot(p));
// DeviceScalar — stays on device, no sync.
auto absNew = residual.dot(p); // DeviceScalar, no sync
// while (i < maxIters) {
Index maxIters = 200;
Index i = 0;
while (i < maxIters) {
// tmp.noalias() = mat * p;
tmp.noalias() = mat * p; // SpMV, device-resident
// auto alpha = absNew / p.dot(tmp);
// DeviceScalar / DeviceScalar → device kernel, no sync!
auto alpha = absNew / p.dot(tmp); // DeviceScalar, no sync
// x += alpha * p;
// DeviceScalar * DeviceMatrix → device-pointer axpy, no sync!
x += alpha * p;
// residual -= alpha * tmp;
residual -= alpha * tmp; // device-pointer axpy, no sync
// residualNorm2 = residual.squaredNorm();
residualNorm2 = residual.squaredNorm(); // THE one sync per iteration
// if (residualNorm2 < threshold) break;
if (residualNorm2 < threshold) break;
// z = precond.solve(residual);
z.copyFrom(ctx, residual); // no preconditioner
// auto absOld = std::move(absNew);
auto absOld = std::move(absNew); // no sync, no alloc
// absNew = numext::real(residual.dot(z));
absNew = residual.dot(z); // DeviceScalar, no sync
// auto beta = absNew / absOld;
// DeviceScalar / DeviceScalar → device kernel, no sync!
auto beta = absNew / absOld; // DeviceScalar, no sync
// p = z + beta * p;
p *= beta; // device-pointer scal, no host sync
p += z;
i++;
}
}
GpuContext::setThreadLocal(nullptr);
state.SetItemsProcessed(state.iterations() * 200);
}
BENCHMARK(BM_CG_DeviceMatrixOps)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
// ==========================================================================
// Raw cuBLAS device-pointer-mode CG (1 sync/iter) — performance lower bound
// ==========================================================================
__global__ void scalar_div_kernel(const Scalar* a, const Scalar* b, Scalar* out) { *out = *a / *b; }
__global__ void scalar_neg_kernel(const Scalar* in, Scalar* out) { *out = -(*in); }
static void BM_CG_DevicePointerMode(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const int maxIters = 200;
SpMat A = make_spd(n);
Vec b = Vec::Random(n);
cudaStream_t stream;
cudaStreamCreate(&stream);
cublasHandle_t cublas;
cublasCreate(&cublas);
cublasSetStream(cublas, stream);
cusparseHandle_t cusparse;
cusparseCreate(&cusparse);
cusparseSetStream(cusparse, stream);
internal::DeviceBuffer d_outer((n + 1) * sizeof(int));
internal::DeviceBuffer d_inner(A.nonZeros() * sizeof(int));
internal::DeviceBuffer d_vals(A.nonZeros() * sizeof(Scalar));
cudaMemcpy(d_outer.ptr, A.outerIndexPtr(), (n + 1) * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_inner.ptr, A.innerIndexPtr(), A.nonZeros() * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_vals.ptr, A.valuePtr(), A.nonZeros() * sizeof(Scalar), cudaMemcpyHostToDevice);
cusparseSpMatDescr_t matA;
cusparseCreateCsc(&matA, n, n, A.nonZeros(), d_outer.ptr, d_inner.ptr, d_vals.ptr, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
internal::DeviceBuffer d_tmp_buf(n * sizeof(Scalar));
cusparseDnVecDescr_t tmp_x, tmp_y;
cusparseCreateDnVec(&tmp_x, n, d_tmp_buf.ptr, CUDA_R_64F);
cusparseCreateDnVec(&tmp_y, n, d_tmp_buf.ptr, CUDA_R_64F);
Scalar spmv_alpha = 1.0, spmv_beta = 0.0;
size_t ws_size = 0;
cusparseSpMV_bufferSize(cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &spmv_alpha, matA, tmp_x, &spmv_beta, tmp_y,
CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &ws_size);
internal::DeviceBuffer d_workspace(ws_size);
cusparseDestroyDnVec(tmp_x);
cusparseDestroyDnVec(tmp_y);
internal::DeviceBuffer d_x(n * sizeof(Scalar)), d_r(n * sizeof(Scalar));
internal::DeviceBuffer d_p(n * sizeof(Scalar)), d_tmp(n * sizeof(Scalar));
internal::DeviceBuffer d_b(n * sizeof(Scalar));
internal::DeviceBuffer d_absNew(sizeof(Scalar)), d_absOld(sizeof(Scalar));
internal::DeviceBuffer d_pdot(sizeof(Scalar)), d_alpha(sizeof(Scalar));
internal::DeviceBuffer d_neg_alpha(sizeof(Scalar)), d_beta(sizeof(Scalar));
internal::DeviceBuffer d_rnorm(sizeof(RealScalar));
cudaMemcpy(d_b.ptr, b.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice);
auto spmv = [&](Scalar* x_ptr, Scalar* y_ptr) {
cusparseDnVecDescr_t vx, vy;
cusparseCreateDnVec(&vx, n, x_ptr, CUDA_R_64F);
cusparseCreateDnVec(&vy, n, y_ptr, CUDA_R_64F);
cusparseSpMV(cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &spmv_alpha, matA, vx, &spmv_beta, vy, CUDA_R_64F,
CUSPARSE_SPMV_ALG_DEFAULT, d_workspace.ptr);
cusparseDestroyDnVec(vx);
cusparseDestroyDnVec(vy);
};
for (auto _ : state) {
cudaMemsetAsync(static_cast<Scalar*>(d_x.ptr), 0, n * sizeof(Scalar), stream);
cudaMemcpyAsync(d_r.ptr, d_b.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
cudaMemcpyAsync(d_p.ptr, d_b.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_DEVICE);
cublasDdot(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_p.ptr), 1,
static_cast<Scalar*>(d_absNew.ptr));
for (int i = 0; i < maxIters; ++i) {
spmv(static_cast<Scalar*>(d_p.ptr), static_cast<Scalar*>(d_tmp.ptr));
cublasDdot(cublas, n, static_cast<Scalar*>(d_p.ptr), 1, static_cast<Scalar*>(d_tmp.ptr), 1,
static_cast<Scalar*>(d_pdot.ptr));
scalar_div_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_absNew.ptr), static_cast<Scalar*>(d_pdot.ptr),
static_cast<Scalar*>(d_alpha.ptr));
scalar_neg_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_alpha.ptr), static_cast<Scalar*>(d_neg_alpha.ptr));
cublasDaxpy(cublas, n, static_cast<Scalar*>(d_alpha.ptr), static_cast<Scalar*>(d_p.ptr), 1,
static_cast<Scalar*>(d_x.ptr), 1);
cublasDaxpy(cublas, n, static_cast<Scalar*>(d_neg_alpha.ptr), static_cast<Scalar*>(d_tmp.ptr), 1,
static_cast<Scalar*>(d_r.ptr), 1);
cublasDnrm2(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<RealScalar*>(d_rnorm.ptr));
RealScalar rnorm;
cudaMemcpyAsync(&rnorm, d_rnorm.ptr, sizeof(RealScalar), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
if (rnorm * rnorm < 1e-20) break;
cudaMemcpyAsync(d_absOld.ptr, d_absNew.ptr, sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
cublasDdot(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_r.ptr), 1,
static_cast<Scalar*>(d_absNew.ptr));
scalar_div_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_absNew.ptr), static_cast<Scalar*>(d_absOld.ptr),
static_cast<Scalar*>(d_beta.ptr));
cublasDscal(cublas, n, static_cast<Scalar*>(d_beta.ptr), static_cast<Scalar*>(d_p.ptr), 1);
cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_HOST);
Scalar one = 1.0;
cublasDaxpy(cublas, n, &one, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_p.ptr), 1);
cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_DEVICE);
}
cudaStreamSynchronize(stream);
}
state.SetItemsProcessed(state.iterations() * maxIters);
cusparseDestroySpMat(matA);
cusparseDestroy(cusparse);
cublasDestroy(cublas);
cudaStreamDestroy(stream);
}
BENCHMARK(BM_CG_DevicePointerMode)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);

View File

@@ -0,0 +1,216 @@
// Benchmark: GPU CG vs CPU CG on realistic sparse systems.
//
// Tests 2D Laplacian (5-point stencil) and 3D Laplacian (7-point stencil)
// in both float and double precision.
//
// Usage:
// cmake --build build-bench-gpu --target bench_gpu_cg_vs_cpu
// ./build-bench-gpu/bench_gpu_cg_vs_cpu
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Sparse matrix generators -----------------------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_laplacian_2d(int grid_n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
const int n = grid_n * grid_n;
SpMat A(n, n);
A.reserve(VectorXi::Constant(n, 5));
for (int i = 0; i < grid_n; ++i) {
for (int j = 0; j < grid_n; ++j) {
int idx = i * grid_n + j;
A.insert(idx, idx) = Scalar(4);
if (i > 0) A.insert(idx, idx - grid_n) = Scalar(-1);
if (i < grid_n - 1) A.insert(idx, idx + grid_n) = Scalar(-1);
if (j > 0) A.insert(idx, idx - 1) = Scalar(-1);
if (j < grid_n - 1) A.insert(idx, idx + 1) = Scalar(-1);
}
}
A.makeCompressed();
return A;
}
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_laplacian_3d(int grid_n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
const int n = grid_n * grid_n * grid_n;
const int n2 = grid_n * grid_n;
SpMat A(n, n);
A.reserve(VectorXi::Constant(n, 7));
for (int i = 0; i < grid_n; ++i) {
for (int j = 0; j < grid_n; ++j) {
for (int k = 0; k < grid_n; ++k) {
int idx = i * n2 + j * grid_n + k;
A.insert(idx, idx) = Scalar(6);
if (i > 0) A.insert(idx, idx - n2) = Scalar(-1);
if (i < grid_n - 1) A.insert(idx, idx + n2) = Scalar(-1);
if (j > 0) A.insert(idx, idx - grid_n) = Scalar(-1);
if (j < grid_n - 1) A.insert(idx, idx + grid_n) = Scalar(-1);
if (k > 0) A.insert(idx, idx - 1) = Scalar(-1);
if (k < grid_n - 1) A.insert(idx, idx + 1) = Scalar(-1);
}
}
}
A.makeCompressed();
return A;
}
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
// ---- CPU CG -----------------------------------------------------------------
template <typename Scalar, typename MatGen>
void run_cpu_cg(benchmark::State& state, MatGen make_matrix) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
const int grid_n = state.range(0);
SpMat A = make_matrix(grid_n);
Vec b = Vec::Random(A.rows());
ConjugateGradient<SpMat, Lower | Upper> cg;
cg.setMaxIterations(10000);
cg.setTolerance(RealScalar(1e-8));
cg.compute(A);
int last_iters = 0;
for (auto _ : state) {
Vec x = cg.solve(b);
benchmark::DoNotOptimize(x.data());
last_iters = cg.iterations();
}
state.counters["n"] = A.rows();
state.counters["nnz"] = A.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = cg.error();
}
// ---- GPU CG -----------------------------------------------------------------
template <typename Scalar, typename MatGen>
void run_gpu_cg(benchmark::State& state, MatGen make_matrix) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
cuda_warmup();
const int grid_n = state.range(0);
SpMat A = make_matrix(grid_n);
const Index n = A.rows();
Vec b = Vec::Random(n);
// Extract inverse diagonal.
Vec invdiag(n);
for (Index j = 0; j < A.outerSize(); ++j) {
typename SpMat::InnerIterator it(A, j);
while (it && it.index() != j) ++it;
if (it && it.index() == j && it.value() != Scalar(0))
invdiag(j) = Scalar(1) / it.value();
else
invdiag(j) = Scalar(1);
}
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<Scalar> spmv_ctx(ctx);
auto mat = spmv_ctx.deviceView(A);
auto d_invdiag = DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
int last_iters = 0;
RealScalar last_error = 0;
for (auto _ : state) {
DeviceMatrix<Scalar> d_x(n, 1);
d_x.setZero(ctx);
DeviceMatrix<Scalar> residual(n, 1);
residual.copyFrom(ctx, d_b);
RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
RealScalar tol = RealScalar(1e-8);
RealScalar threshold = tol * tol * rhsNorm2;
RealScalar residualNorm2 = residual.squaredNorm(ctx);
DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
auto absNew = residual.dot(ctx, p);
Index i = 0;
Index maxIters = 10000;
while (i < maxIters) {
tmp.noalias() = mat * p;
auto alpha = absNew / p.dot(ctx, tmp);
d_x += alpha * p;
residual -= alpha * tmp;
residualNorm2 = residual.squaredNorm(ctx);
if (residualNorm2 < threshold) break;
z.cwiseProduct(ctx, d_invdiag, residual);
auto absOld = std::move(absNew);
absNew = residual.dot(ctx, z);
auto beta = absNew / absOld;
p *= beta;
p += z;
i++;
}
benchmark::DoNotOptimize(d_x.data());
last_iters = i;
last_error = numext::sqrt(residualNorm2 / rhsNorm2);
}
GpuContext::setThreadLocal(nullptr);
state.counters["n"] = n;
state.counters["nnz"] = A.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
}
// ---- 2D Laplacian, double ---------------------------------------------------
static void BM_CG_CPU_2D_double(benchmark::State& state) { run_cpu_cg<double>(state, make_laplacian_2d<double>); }
static void BM_CG_GPU_2D_double(benchmark::State& state) { run_gpu_cg<double>(state, make_laplacian_2d<double>); }
BENCHMARK(BM_CG_CPU_2D_double)->ArgsProduct({{32, 64, 128, 256, 512}});
BENCHMARK(BM_CG_GPU_2D_double)->ArgsProduct({{32, 64, 128, 256, 512}});
// ---- 2D Laplacian, float ----------------------------------------------------
static void BM_CG_CPU_2D_float(benchmark::State& state) { run_cpu_cg<float>(state, make_laplacian_2d<float>); }
static void BM_CG_GPU_2D_float(benchmark::State& state) { run_gpu_cg<float>(state, make_laplacian_2d<float>); }
BENCHMARK(BM_CG_CPU_2D_float)->ArgsProduct({{32, 64, 128, 256, 512}});
BENCHMARK(BM_CG_GPU_2D_float)->ArgsProduct({{32, 64, 128, 256, 512}});
// ---- 3D Laplacian, double ---------------------------------------------------
static void BM_CG_CPU_3D_double(benchmark::State& state) { run_cpu_cg<double>(state, make_laplacian_3d<double>); }
static void BM_CG_GPU_3D_double(benchmark::State& state) { run_gpu_cg<double>(state, make_laplacian_3d<double>); }
BENCHMARK(BM_CG_CPU_3D_double)->ArgsProduct({{16, 32, 48, 64}});
BENCHMARK(BM_CG_GPU_3D_double)->ArgsProduct({{16, 32, 48, 64}});
// ---- 3D Laplacian, float ----------------------------------------------------
static void BM_CG_CPU_3D_float(benchmark::State& state) { run_cpu_cg<float>(state, make_laplacian_3d<float>); }
static void BM_CG_GPU_3D_float(benchmark::State& state) { run_gpu_cg<float>(state, make_laplacian_3d<float>); }
BENCHMARK(BM_CG_CPU_3D_float)->ArgsProduct({{16, 32, 48, 64}});
BENCHMARK(BM_CG_GPU_3D_float)->ArgsProduct({{16, 32, 48, 64}});

View File

@@ -0,0 +1,185 @@
// GPU FFT benchmarks: GpuFFT 1D and 2D throughput.
//
// Measures forward and inverse FFT performance across a range of sizes,
// including plan-amortized (reuse) and cold-start (new plan) scenarios.
//
// Usage:
// cmake --build build-bench-gpu --target bench_gpu_fft
// ./build-bench-gpu/bench_gpu_fft
//
// Profiling:
// nsys profile --trace=cuda ./build-bench-gpu/bench_gpu_fft
#include <benchmark/benchmark.h>
#include <Eigen/GPU>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
using Scalar = SCALAR;
using Complex = std::complex<Scalar>;
using CVec = Matrix<Complex, Dynamic, 1>;
using RVec = Matrix<Scalar, Dynamic, 1>;
using CMat = Matrix<Complex, Dynamic, Dynamic>;
// CUDA warm-up: ensure the GPU is initialized before timing.
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
// --------------------------------------------------------------------------
// 1D C2C Forward
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_C2C_Fwd(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CVec x = CVec::Random(n);
GpuFFT<Scalar> fft;
// Warm up plan.
CVec tmp = fft.fwd(x);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.fwd(x));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2); // read + write
}
BENCHMARK(BM_GpuFFT_1D_C2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 1D C2C Inverse
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_C2C_Inv(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CVec x = CVec::Random(n);
GpuFFT<Scalar> fft;
CVec X = fft.fwd(x);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.inv(X));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2);
}
BENCHMARK(BM_GpuFFT_1D_C2C_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 1D R2C Forward
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_R2C_Fwd(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
RVec r = RVec::Random(n);
GpuFFT<Scalar> fft;
// Warm up plan.
CVec tmp = fft.fwd(r);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.fwd(r));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * (n * sizeof(Scalar) + (n / 2 + 1) * sizeof(Complex)));
}
BENCHMARK(BM_GpuFFT_1D_R2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 1D C2R Inverse
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_C2R_Inv(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
RVec r = RVec::Random(n);
GpuFFT<Scalar> fft;
CVec R = fft.fwd(r);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.invReal(R, n));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * ((n / 2 + 1) * sizeof(Complex) + n * sizeof(Scalar)));
}
BENCHMARK(BM_GpuFFT_1D_C2R_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 2D C2C Forward
// --------------------------------------------------------------------------
static void BM_GpuFFT_2D_C2C_Fwd(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0); // square n x n
CMat A = CMat::Random(n, n);
GpuFFT<Scalar> fft;
// Warm up plan.
CMat tmp = fft.fwd2d(A);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.fwd2d(A));
}
state.SetItemsProcessed(state.iterations() * n * n);
state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 2);
}
BENCHMARK(BM_GpuFFT_2D_C2C_Fwd)->RangeMultiplier(2)->Range(64, 4096);
// --------------------------------------------------------------------------
// 2D C2C Roundtrip (fwd + inv)
// --------------------------------------------------------------------------
static void BM_GpuFFT_2D_C2C_Roundtrip(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CMat A = CMat::Random(n, n);
GpuFFT<Scalar> fft;
// Warm up plans.
CMat tmp = fft.inv2d(fft.fwd2d(A));
for (auto _ : state) {
CMat B = fft.fwd2d(A);
benchmark::DoNotOptimize(fft.inv2d(B));
}
state.SetItemsProcessed(state.iterations() * n * n * 2); // fwd + inv
state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 4);
}
BENCHMARK(BM_GpuFFT_2D_C2C_Roundtrip)->RangeMultiplier(2)->Range(64, 4096);
// --------------------------------------------------------------------------
// 1D Cold start (includes plan creation)
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_ColdStart(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CVec x = CVec::Random(n);
for (auto _ : state) {
GpuFFT<Scalar> fft; // new object = new plans
benchmark::DoNotOptimize(fft.fwd(x));
}
state.SetItemsProcessed(state.iterations() * n);
}
BENCHMARK(BM_GpuFFT_1D_ColdStart)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);

View File

@@ -481,13 +481,13 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
ei_add_test(gpu_basic) ei_add_test(gpu_basic)
ei_add_test(gpu_library_example "" "CUDA::cusolver") ei_add_test(gpu_library_example "" "CUDA::cusolver")
# DeviceMatrix tests: only CUDA runtime, no NVIDIA libraries. # DeviceMatrix tests: CUDA runtime + cuBLAS + cuSOLVER (for BLAS-1 ops via GpuContext).
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
add_executable(gpu_device_matrix gpu_device_matrix.cpp) add_executable(gpu_device_matrix gpu_device_matrix.cpp)
target_include_directories(gpu_device_matrix PRIVATE target_include_directories(gpu_device_matrix PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include" "${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}") "${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart) target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart CUDA::cublas CUDA::cusolver CUDA::npps CUDA::nppc)
target_compile_definitions(gpu_device_matrix PRIVATE target_compile_definitions(gpu_device_matrix PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE} EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1) EIGEN_TEST_PART_ALL=1)
@@ -528,7 +528,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
# compiler and linked against CUDA runtime + cuSOLVER. This avoids NVCC # compiler and linked against CUDA runtime + cuSOLVER. This avoids NVCC
# instantiating Eigen's CPU packet operations for CUDA vector types. # instantiating Eigen's CPU packet operations for CUDA vector types.
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
foreach(_cusolver_test IN ITEMS gpu_cusolver_llt gpu_cusolver_lu) foreach(_cusolver_test IN ITEMS gpu_cusolver_llt gpu_cusolver_lu gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen)
add_executable(${_cusolver_test} ${_cusolver_test}.cpp) add_executable(${_cusolver_test} ${_cusolver_test}.cpp)
target_include_directories(${_cusolver_test} PRIVATE target_include_directories(${_cusolver_test} PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include" "${CUDA_TOOLKIT_ROOT_DIR}/include"
@@ -547,11 +547,103 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif() endif()
# cuFFT test (cuFFT is part of the CUDA toolkit — no separate option needed).
if(TARGET CUDA::cufft)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
add_executable(gpu_cufft gpu_cufft.cpp)
target_include_directories(gpu_cufft PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_cufft
Eigen3::Eigen CUDA::cudart CUDA::cufft CUDA::cublas)
target_compile_definitions(gpu_cufft PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
add_test(NAME gpu_cufft COMMAND gpu_cufft)
add_dependencies(buildtests gpu_cufft)
add_dependencies(buildtests_gpu gpu_cufft)
set_property(TEST gpu_cufft APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST gpu_cufft PROPERTY SKIP_RETURN_CODE 77)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif()
# cuSPARSE SpMV test (cuSPARSE is part of the CUDA toolkit).
if(TARGET CUDA::cusparse)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
add_executable(gpu_cusparse_spmv gpu_cusparse_spmv.cpp)
target_include_directories(gpu_cusparse_spmv PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_cusparse_spmv
Eigen3::Eigen CUDA::cudart CUDA::cusparse CUDA::cublas CUDA::cusolver)
target_compile_definitions(gpu_cusparse_spmv PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
add_test(NAME gpu_cusparse_spmv COMMAND gpu_cusparse_spmv)
add_dependencies(buildtests gpu_cusparse_spmv)
add_dependencies(buildtests_gpu gpu_cusparse_spmv)
set_property(TEST gpu_cusparse_spmv APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST gpu_cusparse_spmv PROPERTY SKIP_RETURN_CODE 77)
# End-to-end GPU CG test: Eigen's ConjugateGradient with DeviceMatrix.
add_executable(gpu_cg gpu_cg.cpp)
target_include_directories(gpu_cg PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_cg
Eigen3::Eigen CUDA::cudart CUDA::cusparse CUDA::cublas CUDA::cusolver CUDA::npps CUDA::nppc)
target_compile_definitions(gpu_cg PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
add_test(NAME gpu_cg COMMAND gpu_cg)
add_dependencies(buildtests gpu_cg)
add_dependencies(buildtests_gpu gpu_cg)
set_property(TEST gpu_cg APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST gpu_cg PROPERTY SKIP_RETURN_CODE 77)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif()
option(EIGEN_TEST_CUSPARSE "Test cuSPARSE integration" OFF) option(EIGEN_TEST_CUSPARSE "Test cuSPARSE integration" OFF)
if(EIGEN_TEST_CUSPARSE AND TARGET CUDA::cusparse) if(EIGEN_TEST_CUSPARSE AND TARGET CUDA::cusparse)
ei_add_test(gpu_cusparse "" "CUDA::cusparse") ei_add_test(gpu_cusparse "" "CUDA::cusparse")
endif() endif()
# cuDSS sparse direct solver tests.
# cuDSS is distributed separately from the CUDA Toolkit.
option(EIGEN_TEST_CUDSS "Test cuDSS sparse solver integration" OFF)
if(EIGEN_TEST_CUDSS)
find_path(CUDSS_INCLUDE_DIR cudss.h
HINTS ${CUDSS_DIR}/include ${CUDA_TOOLKIT_ROOT_DIR}/include /usr/include)
find_library(CUDSS_LIBRARY cudss
HINTS ${CUDSS_DIR}/lib ${CUDSS_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib/x86_64-linux-gnu)
if(CUDSS_INCLUDE_DIR AND CUDSS_LIBRARY)
message(STATUS "cuDSS found: ${CUDSS_LIBRARY}")
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
foreach(_cudss_test IN ITEMS gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu)
add_executable(${_cudss_test} ${_cudss_test}.cpp)
target_include_directories(${_cudss_test} PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CUDSS_INCLUDE_DIR}"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(${_cudss_test}
Eigen3::Eigen CUDA::cudart CUDA::cusolver CUDA::cublas ${CUDSS_LIBRARY})
target_compile_definitions(${_cudss_test} PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1
EIGEN_CUDSS=1)
add_test(NAME ${_cudss_test} COMMAND "${_cudss_test}")
add_dependencies(buildtests ${_cudss_test})
add_dependencies(buildtests_gpu ${_cudss_test})
set_property(TEST ${_cudss_test} APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST ${_cudss_test} PROPERTY SKIP_RETURN_CODE 77)
endforeach()
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
else()
message(WARNING "EIGEN_TEST_CUDSS=ON but cuDSS not found. Set CUDSS_DIR.")
endif()
endif()
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif() endif()

test/gpu_cg.cpp
View File

@@ -0,0 +1,224 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// End-to-end test: CG algorithm running on GPU via DeviceMatrix.
//
// Uses DeviceSparseView for SpMV, DeviceMatrix for vectors, DeviceScalar
// for deferred reductions. Verifies correctness against CPU ConjugateGradient.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a sparse SPD matrix --------------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
R.makeCompressed();
SpMat A = R.adjoint() * R;
for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
A.makeCompressed();
return A;
}
// ---- GPU CG without preconditioner ------------------------------------------
template <typename Scalar>
void test_gpu_cg(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
// CPU reference (identity preconditioner to match GPU).
ConjugateGradient<SpMat, Lower | Upper, IdentityPreconditioner> cpu_cg;
cpu_cg.setMaxIterations(1000);
cpu_cg.setTolerance(RealScalar(1e-8));
cpu_cg.compute(A);
Vec x_cpu = cpu_cg.solve(b);
VERIFY_IS_EQUAL(cpu_cg.info(), Success);
// GPU CG: mirrors Eigen's conjugate_gradient() using DeviceMatrix ops.
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<Scalar> spmv_ctx(ctx);
auto mat = spmv_ctx.deviceView(A);
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
DeviceMatrix<Scalar> d_x(n, 1);
d_x.setZero(ctx);
// r = b (since x=0)
DeviceMatrix<Scalar> residual(n, 1);
residual.copyFrom(ctx, d_b);
RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
RealScalar tol = RealScalar(1e-8);
RealScalar threshold = tol * tol * rhsNorm2;
RealScalar residualNorm2 = residual.squaredNorm(ctx);
// p = r (no preconditioner)
DeviceMatrix<Scalar> p(n, 1);
p.copyFrom(ctx, residual);
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
auto absNew = residual.dot(ctx, p);
Index maxIters = 1000;
Index i = 0;
while (i < maxIters) {
tmp.noalias() = mat * p;
auto alpha = absNew / p.dot(ctx, tmp);
d_x += alpha * p;
residual -= alpha * tmp;
residualNorm2 = residual.squaredNorm(ctx);
if (residualNorm2 < threshold) break;
// z = r (no preconditioner)
z.copyFrom(ctx, residual);
auto absOld = std::move(absNew);
absNew = residual.dot(ctx, z);
auto beta = absNew / absOld;
p *= Scalar(beta);
p += z;
i++;
}
GpuContext::setThreadLocal(nullptr);
Vec x_gpu = d_x.toHost(ctx.stream());
// Verify residual.
Vec r = A * x_gpu - b;
RealScalar relres = r.norm() / b.norm();
VERIFY(relres < RealScalar(1e-6));
// Compare with CPU.
RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
}
// ---- GPU CG with Jacobi preconditioner --------------------------------------
template <typename Scalar>
void test_gpu_cg_jacobi(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
// CPU reference.
ConjugateGradient<SpMat, Lower | Upper> cpu_cg;
cpu_cg.setMaxIterations(1000);
cpu_cg.setTolerance(RealScalar(1e-8));
cpu_cg.compute(A);
Vec x_cpu = cpu_cg.solve(b);
// Extract inverse diagonal.
Vec invdiag(n);
for (Index j = 0; j < A.outerSize(); ++j) {
typename SpMat::InnerIterator it(A, j);
while (it && it.index() != j) ++it;
if (it && it.index() == j && it.value() != Scalar(0))
invdiag(j) = Scalar(1) / it.value();
else
invdiag(j) = Scalar(1);
}
// GPU CG with Jacobi preconditioner.
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<Scalar> spmv_ctx(ctx);
auto mat = spmv_ctx.deviceView(A);
auto d_invdiag = DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
DeviceMatrix<Scalar> d_x(n, 1);
d_x.setZero(ctx);
DeviceMatrix<Scalar> residual(n, 1);
residual.copyFrom(ctx, d_b);
RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
RealScalar tol = RealScalar(1e-8);
RealScalar threshold = tol * tol * rhsNorm2;
RealScalar residualNorm2 = residual.squaredNorm(ctx);
// p = precond.solve(r) = invdiag .* r
DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
auto absNew = residual.dot(ctx, p);
Index maxIters = 1000;
Index i = 0;
while (i < maxIters) {
tmp.noalias() = mat * p;
auto alpha = absNew / p.dot(ctx, tmp);
d_x += alpha * p;
residual -= alpha * tmp;
residualNorm2 = residual.squaredNorm(ctx);
if (residualNorm2 < threshold) break;
// z = precond.solve(r) = invdiag .* r
z.cwiseProduct(ctx, d_invdiag, residual);
auto absOld = std::move(absNew);
absNew = residual.dot(ctx, z);
auto beta = absNew / absOld;
p *= beta;
p += z;
i++;
}
GpuContext::setThreadLocal(nullptr);
Vec x_gpu = d_x.toHost(ctx.stream());
Vec r = A * x_gpu - b;
RealScalar relres = r.norm() / b.norm();
VERIFY(relres < RealScalar(1e-6));
RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
}
EIGEN_DECLARE_TEST(gpu_cg) {
CALL_SUBTEST(test_gpu_cg<double>(64));
CALL_SUBTEST(test_gpu_cg<double>(256));
CALL_SUBTEST(test_gpu_cg<float>(64));
CALL_SUBTEST(test_gpu_cg_jacobi<double>(64));
CALL_SUBTEST(test_gpu_cg_jacobi<double>(256));
CALL_SUBTEST(test_gpu_cg_jacobi<float>(64));
}

View File

@@ -16,6 +16,32 @@
using namespace Eigen;
// Unit roundoff for GPU GEMM compute precision.
// TF32 (opt-in via EIGEN_CUDA_TF32) has eps ~ 2^{-10}.
template <typename Scalar>
typename NumTraits<Scalar>::Real gpu_unit_roundoff() {
#if defined(EIGEN_CUDA_TF32) && !defined(EIGEN_NO_CUDA_TENSOR_OPS)
using RealScalar = typename NumTraits<Scalar>::Real;
if (std::is_same<RealScalar, float>::value) return RealScalar(9.8e-4);
#endif
return NumTraits<Scalar>::epsilon();
}
// Higham-Mary probabilistic error bound for GEMM:
// ||C - fl(C)||_F <= lambda * sqrt(k) * u * ||A||_F * ||B||_F
// where k is the inner dimension, u is the unit roundoff, and
// lambda = sqrt(2 * ln(2/delta)) with delta = failure probability.
// lambda = 5 corresponds to delta ~ 10^{-6}.
// Reference: Higham & Mary, "A New Approach to Probabilistic Rounding
// Error Analysis", SIAM J. Sci. Comput., 2019.
template <typename Scalar>
typename NumTraits<Scalar>::Real gemm_error_bound(Index k, typename NumTraits<Scalar>::Real normA,
typename NumTraits<Scalar>::Real normB) {
using RealScalar = typename NumTraits<Scalar>::Real;
constexpr RealScalar lambda = 5;
return lambda * std::sqrt(static_cast<RealScalar>(k)) * gpu_unit_roundoff<Scalar>() * normA * normB;
}
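// Illustrative magnitudes (not from the patch): for float with k = 256 and
// ||A||_F = ||B||_F = 1, the bound is 5 * sqrt(256) * 1.2e-7 ≈ 9.5e-6;
// under TF32 (u ≈ 9.8e-4) it relaxes to ≈ 7.8e-2.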
// ---- Basic GEMM: C = A * B -------------------------------------------------
template <typename Scalar>
@@ -36,7 +62,7 @@ void test_gemm_basic(Index m, Index n, Index k) {
Mat C = d_C.toHost();
Mat C_ref = A * B;
- RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+ RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
@@ -59,7 +85,7 @@ void test_gemm_adjoint_lhs(Index m, Index n, Index k) {
Mat C = d_C.toHost();
Mat C_ref = A.adjoint() * B;
- RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+ RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
@@ -82,7 +108,7 @@ void test_gemm_transpose_rhs(Index m, Index n, Index k) {
Mat C = d_C.toHost();
Mat C_ref = A * B.transpose();
- RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+ RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
@@ -106,7 +132,7 @@ void test_gemm_scaled(Index m, Index n, Index k) {
Mat C = d_C.toHost();
Mat C_ref = alpha * A * B;
- RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+ RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
@@ -130,7 +156,7 @@ void test_gemm_accumulate(Index m, Index n, Index k) {
Mat C = d_C.toHost();
Mat C_ref = C_init + A * B;
- RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+ RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
@@ -153,7 +179,7 @@ void test_gemm_accumulate_empty(Index m, Index n, Index k) {
Mat C = d_C.toHost();
Mat C_ref = A * B;
- RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+ RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
@@ -178,7 +204,7 @@ void test_gemm_subtract(Index m, Index n, Index k) {
Mat C = d_C.toHost();
Mat C_ref = C_init - A * B;
- RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+ RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
@@ -202,7 +228,7 @@ void test_gemm_subtract_empty(Index m, Index n, Index k) {
Mat C = d_C.toHost();
Mat C_ref = -(A * B);
- RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+ RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
@@ -226,7 +252,7 @@ void test_gemm_scaled_rhs(Index m, Index n, Index k) {
Mat C = d_C.toHost();
Mat C_ref = A * (alpha * B);
- RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+ RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
@@ -266,7 +292,7 @@ void test_gemm_explicit_context(Index m, Index n, Index k) {
Mat C = d_C.toHost();
Mat C_ref = A * B;
- RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+ RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
@@ -296,7 +322,7 @@ void test_gemm_cross_context_reuse(Index n) {
Mat C = d_C.toHost();
Mat C_ref = A * B + D * E;
- RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+ RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm()) + gemm_error_bound<Scalar>(n, D.norm(), E.norm());
VERIFY((C - C_ref).norm() < tol);
}
@@ -326,7 +352,7 @@ void test_gemm_cross_context_resize() {
Mat C = d_C.toHost();
Mat C_ref = D * E;
- RealScalar tol = RealScalar(16) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+ RealScalar tol = gemm_error_bound<Scalar>(16, D.norm(), E.norm());
VERIFY((C - C_ref).norm() < tol);
}
@@ -353,7 +379,9 @@ void test_gemm_chain(Index n) {
Mat D = d_D.toHost();
Mat D_ref = (A * B) * E;
- RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * D_ref.norm();
+ Mat C_ref = A * B;
+ RealScalar tol =
+     gemm_error_bound<Scalar>(n, A.norm(), B.norm()) * E.norm() + gemm_error_bound<Scalar>(n, C_ref.norm(), E.norm());
VERIFY((D - D_ref).norm() < tol);
}
@@ -401,7 +429,7 @@ void test_llt_solve_expr(Index n, Index nrhs) {
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
- VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
+ VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
// ---- LLT solve with explicit context ----------------------------------------
@@ -423,7 +451,7 @@ void test_llt_solve_expr_context(Index n, Index nrhs) {
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
- VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
+ VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------
@@ -444,7 +472,7 @@ void test_lu_solve_expr(Index n, Index nrhs) {
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
- VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+ VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------
@@ -474,7 +502,7 @@ void test_gemm_then_solve(Index n) {
Mat X = d_X.toHost();
RealScalar residual = (C * X - D).norm() / D.norm();
- VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
+ VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
// ---- LLT solve with Upper triangle -----------------------------------------
@@ -495,7 +523,7 @@ void test_llt_solve_upper(Index n, Index nrhs) {
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
- VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
+ VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
// ---- LU solve with explicit context -----------------------------------------
@@ -517,7 +545,7 @@ void test_lu_solve_expr_context(Index n, Index nrhs) {
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
- VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+ VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
// ---- Zero-nrhs solver expressions ------------------------------------------
@@ -581,7 +609,7 @@ void test_trsm(Index n, Index nrhs) {
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
- VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
+ VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
}
// ---- SYMM/HEMM: selfadjointView<UpLo>() * B --------------------------------
@@ -603,7 +631,7 @@ void test_symm(Index n, Index nrhs) {
Mat C = d_C.toHost();
Mat C_ref = A * B;  // A is symmetric, so full multiply == symm
- RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+ RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm());
VERIFY((C - C_ref).norm() < tol);
}
@@ -629,7 +657,7 @@ void test_syrk(Index n, Index k) {
Mat C_lower = C.template triangularView<Lower>();
Mat C_ref_lower = C_ref.template triangularView<Lower>();
- RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+ RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), A.norm());
VERIFY((C_lower - C_ref_lower).norm() < tol);
}

154
test/gpu_cudss_ldlt.cpp Normal file
View File

@@ -0,0 +1,154 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseLDLT: GPU sparse LDL^T via cuDSS.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse symmetric indefinite matrix ---------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_symmetric_indefinite(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
// Build a random sparse matrix and symmetrize it.
// The diagonal has mixed signs to ensure indefiniteness.
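// (Unlike Cholesky, LDL^T does not require positive definiteness, so an
// indefinite matrix exercises the factorization path that plain LLT
// would reject.)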
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
R.makeCompressed();
// A = R + R^H (symmetric), then add diagonal with alternating signs for indefiniteness.
SpMat A = R + SparseMatrix<Scalar, ColMajor, int>(R.adjoint());
for (Index i = 0; i < n; ++i) {
Scalar diag_val = Scalar((i % 2 == 0) ? n : -n);
A.coeffRef(i, i) += diag_val;
}
A.makeCompressed();
return A;
}
// ---- Solve and check residual -----------------------------------------------
template <typename Scalar>
void test_solve(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_symmetric_indefinite<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLDLT<Scalar> ldlt(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Vec x = ldlt.solve(b);
VERIFY_IS_EQUAL(x.rows(), n);
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Multiple RHS -----------------------------------------------------------
template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_symmetric_indefinite<Scalar>(n);
Mat B = Mat::Random(n, nrhs);
GpuSparseLDLT<Scalar> ldlt(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Mat X = ldlt.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
Mat R = A * X - B;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(R.norm() / B.norm() < tol);
}
// ---- Refactorize ------------------------------------------------------------
template <typename Scalar>
void test_refactorize(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_symmetric_indefinite<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLDLT<Scalar> ldlt;
ldlt.analyzePattern(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
ldlt.factorize(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Vec x1 = ldlt.solve(b);
// Modify values, keep pattern.
SpMat A2 = A;
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
ldlt.factorize(A2);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Vec x2 = ldlt.solve(b);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((A * x1 - b).norm() / b.norm() < tol);
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}
// ---- Empty ------------------------------------------------------------------
void test_empty() {
using SpMat = SparseMatrix<double, ColMajor, int>;
SpMat A(0, 0);
A.makeCompressed();
GpuSparseLDLT<double> ldlt(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
VERIFY_IS_EQUAL(ldlt.rows(), 0);
VERIFY_IS_EQUAL(ldlt.cols(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_solve<Scalar>(64));
CALL_SUBTEST(test_solve<Scalar>(256));
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
CALL_SUBTEST(test_refactorize<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cudss_ldlt) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_empty());
}

202
test/gpu_cudss_llt.cpp Normal file
View File

@@ -0,0 +1,202 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseLLT: GPU sparse Cholesky via cuDSS.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse SPD matrix -------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
// Uses the global std::rand state seeded by the test framework (g_seed).
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
R.makeCompressed();
// A = R^H * R + n * I (guaranteed SPD).
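// (R^H * R is positive semidefinite, so adding n to every diagonal entry
// shifts each eigenvalue by n, making them all >= n > 0.)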
SpMat A = R.adjoint() * R;
for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
A.makeCompressed();
return A;
}
// ---- Solve and check residual -----------------------------------------------
template <typename Scalar>
void test_solve(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x = llt.solve(b);
VERIFY_IS_EQUAL(x.rows(), n);
// Check residual: ||Ax - b|| / ||b||.
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Compare with CPU SimplicialLLT -----------------------------------------
template <typename Scalar>
void test_vs_cpu(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar> gpu_llt(A);
VERIFY_IS_EQUAL(gpu_llt.info(), Success);
Vec x_gpu = gpu_llt.solve(b);
SimplicialLLT<SpMat> cpu_llt(A);
VERIFY_IS_EQUAL(cpu_llt.info(), Success);
Vec x_cpu = cpu_llt.solve(b);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((x_gpu - x_cpu).norm() / x_cpu.norm() < tol);
}
// ---- Multiple RHS -----------------------------------------------------------
template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Mat B = Mat::Random(n, nrhs);
GpuSparseLLT<Scalar> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Mat X = llt.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
Mat R = A * X - B;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(R.norm() / B.norm() < tol);
}
// ---- Separate analyze + factorize (refactorization) -------------------------
template <typename Scalar>
void test_refactorize(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar> llt;
llt.analyzePattern(A);
VERIFY_IS_EQUAL(llt.info(), Success);
// First factorize + solve.
llt.factorize(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x1 = llt.solve(b);
// Modify values (keep same pattern): scale diagonal.
SpMat A2 = A;
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
// Refactorize with same pattern.
llt.factorize(A2);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x2 = llt.solve(b);
// Both solutions should satisfy their respective systems.
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((A * x1 - b).norm() / b.norm() < tol);
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
// Solutions should differ (A2 != A).
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}
// ---- Empty matrix -----------------------------------------------------------
void test_empty() {
using SpMat = SparseMatrix<double, ColMajor, int>;
SpMat A(0, 0);
A.makeCompressed();
GpuSparseLLT<double> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
VERIFY_IS_EQUAL(llt.rows(), 0);
VERIFY_IS_EQUAL(llt.cols(), 0);
}
// ---- Upper triangle ---------------------------------------------------------
template <typename Scalar>
void test_upper(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar, Upper> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x = llt.solve(b);
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_solve<Scalar>(64));
CALL_SUBTEST(test_solve<Scalar>(256));
CALL_SUBTEST(test_vs_cpu<Scalar>(64));
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
CALL_SUBTEST(test_refactorize<Scalar>(64));
CALL_SUBTEST(test_upper<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cudss_llt) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_empty());
}

147
test/gpu_cudss_lu.cpp Normal file
View File

@@ -0,0 +1,147 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseLU: GPU sparse LU via cuDSS.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse non-singular general matrix ---------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_general(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
// Add strong diagonal for non-singularity.
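// (Entries of R lie in [-0.5, 0.5], so the shifted diagonal has magnitude
// at least n - 0.5 while each off-diagonal row sum is at most 0.5 * (n - 1);
// the matrix is strictly diagonally dominant, hence nonsingular.)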
for (Index i = 0; i < n; ++i) R.coeffRef(i, i) += Scalar(RealScalar(n));
R.makeCompressed();
return R;
}
// ---- Solve and check residual -----------------------------------------------
template <typename Scalar>
void test_solve(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_general<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLU<Scalar> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
Vec x = lu.solve(b);
VERIFY_IS_EQUAL(x.rows(), n);
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Multiple RHS -----------------------------------------------------------
template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_general<Scalar>(n);
Mat B = Mat::Random(n, nrhs);
GpuSparseLU<Scalar> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
Mat X = lu.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
Mat R = A * X - B;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(R.norm() / B.norm() < tol);
}
// ---- Refactorize ------------------------------------------------------------
template <typename Scalar>
void test_refactorize(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_general<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLU<Scalar> lu;
lu.analyzePattern(A);
VERIFY_IS_EQUAL(lu.info(), Success);
lu.factorize(A);
VERIFY_IS_EQUAL(lu.info(), Success);
Vec x1 = lu.solve(b);
// Modify values, keep pattern.
SpMat A2 = A;
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
lu.factorize(A2);
VERIFY_IS_EQUAL(lu.info(), Success);
Vec x2 = lu.solve(b);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((A * x1 - b).norm() / b.norm() < tol);
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}
// ---- Empty ------------------------------------------------------------------
void test_empty() {
using SpMat = SparseMatrix<double, ColMajor, int>;
SpMat A(0, 0);
A.makeCompressed();
GpuSparseLU<double> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
VERIFY_IS_EQUAL(lu.rows(), 0);
VERIFY_IS_EQUAL(lu.cols(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_solve<Scalar>(64));
CALL_SUBTEST(test_solve<Scalar>(256));
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
CALL_SUBTEST(test_refactorize<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cudss_lu) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_empty());
}

186
test/gpu_cufft.cpp Normal file
View File

@@ -0,0 +1,186 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuFFT: GPU FFT via cuFFT.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/GPU>
using namespace Eigen;
// ---- 1D C2C roundtrip: inv(fwd(x)) ≈ x -------------------------------------
template <typename Scalar>
void test_c2c_roundtrip(Index n) {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
using RealScalar = Scalar;
Vec x = Vec::Random(n);
GpuFFT<Scalar> fft;
Vec X = fft.fwd(x);
VERIFY_IS_EQUAL(X.size(), n);
Vec y = fft.inv(X);
VERIFY_IS_EQUAL(y.size(), n);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y - x).norm() / x.norm() < tol);
}
// ---- 1D C2C known signal: FFT of constant = delta --------------------------
template <typename Scalar>
void test_c2c_constant() {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
using RealScalar = Scalar;
const int n = 64;
Vec x = Vec::Constant(n, Complex(3.0, 0.0));
GpuFFT<Scalar> fft;
Vec X = fft.fwd(x);
// FFT of constant c: X[0] = c*n, X[k] = 0 for k > 0.
RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(n);
VERIFY(std::abs(X(0) - Complex(3.0 * n, 0.0)) < tol);
for (int k = 1; k < n; ++k) {
VERIFY(std::abs(X(k)) < tol);
}
}
// ---- 1D R2C/C2R roundtrip: invReal(fwd(r), n) ≈ r --------------------------
template <typename Scalar>
void test_r2c_roundtrip(Index n) {
using Complex = std::complex<Scalar>;
using CVec = Matrix<Complex, Dynamic, 1>;
using RVec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = Scalar;
RVec r = RVec::Random(n);
GpuFFT<Scalar> fft;
CVec R = fft.fwd(r);
// R2C returns n/2+1 complex values.
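// (A real signal has a Hermitian-symmetric spectrum, X[n-k] = conj(X[k]),
// so only the first n/2 + 1 bins are non-redundant.)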
VERIFY_IS_EQUAL(R.size(), n / 2 + 1);
RVec s = fft.invReal(R, n);
VERIFY_IS_EQUAL(s.size(), n);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((s - r).norm() / r.norm() < tol);
}
// ---- 2D C2C roundtrip: inv2d(fwd2d(A)) ≈ A ---------------------------------
template <typename Scalar>
void test_2d_roundtrip(Index rows, Index cols) {
using Complex = std::complex<Scalar>;
using Mat = Matrix<Complex, Dynamic, Dynamic>;
using RealScalar = Scalar;
Mat A = Mat::Random(rows, cols);
GpuFFT<Scalar> fft;
Mat B = fft.fwd2d(A);
VERIFY_IS_EQUAL(B.rows(), rows);
VERIFY_IS_EQUAL(B.cols(), cols);
Mat C = fft.inv2d(B);
VERIFY_IS_EQUAL(C.rows(), rows);
VERIFY_IS_EQUAL(C.cols(), cols);
RealScalar tol = RealScalar(10) * RealScalar(rows * cols) * NumTraits<Scalar>::epsilon();
VERIFY((C - A).norm() / A.norm() < tol);
}
// ---- 2D C2C known signal: constant matrix -----------------------------------
template <typename Scalar>
void test_2d_constant() {
using Complex = std::complex<Scalar>;
using Mat = Matrix<Complex, Dynamic, Dynamic>;
using RealScalar = Scalar;
const int rows = 16, cols = 32;
Mat A = Mat::Constant(rows, cols, Complex(2.0, 0.0));
GpuFFT<Scalar> fft;
Mat B = fft.fwd2d(A);
// 2D FFT of constant c: B(0,0) = c*rows*cols, all others = 0.
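// (The (0,0) bin sums all entries; every other bin pairs the constant
// against a complete set of roots of unity, which sums to zero.)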
RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(rows * cols);
VERIFY(std::abs(B(0, 0) - Complex(2.0 * rows * cols, 0.0)) < tol);
for (int j = 0; j < cols; ++j) {
for (int i = 0; i < rows; ++i) {
if (i == 0 && j == 0) continue;
VERIFY(std::abs(B(i, j)) < tol);
}
}
}
// ---- Plan reuse: repeated calls should work ---------------------------------
template <typename Scalar>
void test_plan_reuse() {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
using RealScalar = Scalar;
GpuFFT<Scalar> fft;
for (int trial = 0; trial < 5; ++trial) {
Vec x = Vec::Random(128);
Vec X = fft.fwd(x);
Vec y = fft.inv(X);
RealScalar tol = RealScalar(10) * RealScalar(128) * NumTraits<Scalar>::epsilon();
VERIFY((y - x).norm() / x.norm() < tol);
}
}
// ---- Empty ------------------------------------------------------------------
template <typename Scalar>
void test_empty() {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
GpuFFT<Scalar> fft;
Vec x(0);
Vec X = fft.fwd(x);
VERIFY_IS_EQUAL(X.size(), 0);
Vec y = fft.inv(X);
VERIFY_IS_EQUAL(y.size(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(64));
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(256));
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(1000)); // non-power-of-2
CALL_SUBTEST(test_c2c_constant<Scalar>());
CALL_SUBTEST(test_r2c_roundtrip<Scalar>(64));
CALL_SUBTEST(test_r2c_roundtrip<Scalar>(256));
CALL_SUBTEST(test_2d_roundtrip<Scalar>(32, 32));
CALL_SUBTEST(test_2d_roundtrip<Scalar>(16, 64)); // non-square
CALL_SUBTEST(test_2d_constant<Scalar>());
CALL_SUBTEST(test_plan_reuse<Scalar>());
CALL_SUBTEST(test_empty<Scalar>());
}
EIGEN_DECLARE_TEST(gpu_cufft) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
}

180
test/gpu_cusolver_eigen.cpp Normal file
View File

@@ -0,0 +1,180 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSelfAdjointEigenSolver: GPU symmetric/Hermitian eigenvalue
// decomposition via cuSOLVER.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Eigenvalues>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Reconstruction: V * diag(W) * V^H ≈ A ---------------------------------
template <typename Scalar>
void test_eigen_reconstruction(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
// Build a symmetric/Hermitian matrix.
Mat R = Mat::Random(n, n);
Mat A = R + R.adjoint();
GpuSelfAdjointEigenSolver<Scalar> es(A);
VERIFY_IS_EQUAL(es.info(), Success);
auto W = es.eigenvalues();
Mat V = es.eigenvectors();
VERIFY_IS_EQUAL(W.size(), n);
VERIFY_IS_EQUAL(V.rows(), n);
VERIFY_IS_EQUAL(V.cols(), n);
// Reconstruct: A_hat = V * diag(W) * V^H.
Mat A_hat = V * W.asDiagonal() * V.adjoint();
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
VERIFY((A_hat - A).norm() < tol);
// Orthogonality: V^H * V ≈ I.
Mat VhV = V.adjoint() * V;
Mat eye = Mat::Identity(n, n);
VERIFY((VhV - eye).norm() < tol);
}
// ---- Eigenvalues match CPU SelfAdjointEigenSolver ---------------------------
template <typename Scalar>
void test_eigen_values(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat R = Mat::Random(n, n);
Mat A = R + R.adjoint();
GpuSelfAdjointEigenSolver<Scalar> gpu_es(A);
VERIFY_IS_EQUAL(gpu_es.info(), Success);
auto W_gpu = gpu_es.eigenvalues();
SelfAdjointEigenSolver<Mat> cpu_es(A);
auto W_cpu = cpu_es.eigenvalues();
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() *
W_cpu.cwiseAbs().maxCoeff();
VERIFY((W_gpu - W_cpu).norm() < tol);
}
// ---- Eigenvalues-only mode --------------------------------------------------
template <typename Scalar>
void test_eigen_values_only(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat R = Mat::Random(n, n);
Mat A = R + R.adjoint();
GpuSelfAdjointEigenSolver<Scalar> gpu_es(A, GpuSelfAdjointEigenSolver<Scalar>::EigenvaluesOnly);
VERIFY_IS_EQUAL(gpu_es.info(), Success);
auto W_gpu = gpu_es.eigenvalues();
SelfAdjointEigenSolver<Mat> cpu_es(A, EigenvaluesOnly);
auto W_cpu = cpu_es.eigenvalues();
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() *
W_cpu.cwiseAbs().maxCoeff();
VERIFY((W_gpu - W_cpu).norm() < tol);
}
// ---- DeviceMatrix input path ------------------------------------------------
template <typename Scalar>
void test_eigen_device_matrix(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat R = Mat::Random(n, n);
Mat A = R + R.adjoint();
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
GpuSelfAdjointEigenSolver<Scalar> es;
es.compute(d_A);
VERIFY_IS_EQUAL(es.info(), Success);
auto W_gpu = es.eigenvalues();
Mat V = es.eigenvectors();
// Verify reconstruction.
Mat A_hat = V * W_gpu.asDiagonal() * V.adjoint();
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
VERIFY((A_hat - A).norm() < tol);
}
// ---- Recompute (reuse solver object) ----------------------------------------
template <typename Scalar>
void test_eigen_recompute(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
GpuSelfAdjointEigenSolver<Scalar> es;
for (int trial = 0; trial < 3; ++trial) {
Mat R = Mat::Random(n, n);
Mat A = R + R.adjoint();
es.compute(A);
VERIFY_IS_EQUAL(es.info(), Success);
auto W = es.eigenvalues();
Mat V = es.eigenvectors();
Mat A_hat = V * W.asDiagonal() * V.adjoint();
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
VERIFY((A_hat - A).norm() < tol);
}
}
// ---- Empty matrix -----------------------------------------------------------
void test_eigen_empty() {
GpuSelfAdjointEigenSolver<double> es(MatrixXd(0, 0));
VERIFY_IS_EQUAL(es.info(), Success);
VERIFY_IS_EQUAL(es.rows(), 0);
VERIFY_IS_EQUAL(es.cols(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
// Reconstruction + orthogonality.
CALL_SUBTEST(test_eigen_reconstruction<Scalar>(64));
CALL_SUBTEST(test_eigen_reconstruction<Scalar>(128));
// Eigenvalues match CPU.
CALL_SUBTEST(test_eigen_values<Scalar>(64));
CALL_SUBTEST(test_eigen_values<Scalar>(128));
// Values-only mode.
CALL_SUBTEST(test_eigen_values_only<Scalar>(64));
// DeviceMatrix input.
CALL_SUBTEST(test_eigen_device_matrix<Scalar>(64));
// Recompute.
CALL_SUBTEST(test_eigen_recompute<Scalar>(32));
}
EIGEN_DECLARE_TEST(gpu_cusolver_eigen) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_eigen_empty());
}

185
test/gpu_cusolver_qr.cpp Normal file
View File

@@ -0,0 +1,185 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuQR: GPU QR decomposition via cuSOLVER.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/QR>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Solve square system: A * X = B -----------------------------------------
template <typename Scalar>
void test_qr_solve_square(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
GpuQR<Scalar> qr(A);
VERIFY_IS_EQUAL(qr.info(), Success);
Mat X = qr.solve(B);
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- Solve overdetermined system: m > n (least-squares) ---------------------
template <typename Scalar>
void test_qr_solve_overdetermined(Index m, Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
eigen_assert(m >= n);
Mat A = Mat::Random(m, n);
Mat B = Mat::Random(m, nrhs);
GpuQR<Scalar> qr(A);
VERIFY_IS_EQUAL(qr.info(), Success);
Mat X = qr.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
// Compare with CPU QR.
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
RealScalar tol = RealScalar(100) * RealScalar(m) * NumTraits<Scalar>::epsilon();
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
}
// ---- Solve with DeviceMatrix input ------------------------------------------
template <typename Scalar>
void test_qr_solve_device(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuQR<Scalar> qr;
qr.compute(d_A);
VERIFY_IS_EQUAL(qr.info(), Success);
DeviceMatrix<Scalar> d_X = qr.solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- Solve overdetermined via device path -----------------------------------
template <typename Scalar>
void test_qr_solve_overdetermined_device(Index m, Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
eigen_assert(m >= n);
Mat A = Mat::Random(m, n);
Mat B = Mat::Random(m, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuQR<Scalar> qr;
qr.compute(d_A);
VERIFY_IS_EQUAL(qr.info(), Success);
DeviceMatrix<Scalar> d_X = qr.solve(d_B);
VERIFY_IS_EQUAL(d_X.rows(), n);
VERIFY_IS_EQUAL(d_X.cols(), nrhs);
Mat X = d_X.toHost();
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
RealScalar tol = RealScalar(100) * RealScalar(m) * NumTraits<Scalar>::epsilon();
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
}
// ---- Multiple solves reuse the factorization --------------------------------
template <typename Scalar>
void test_qr_multiple_solves(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
GpuQR<Scalar> qr(A);
VERIFY_IS_EQUAL(qr.info(), Success);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
for (int k = 0; k < 5; ++k) {
Mat B = Mat::Random(n, 3);
Mat X = qr.solve(B);
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < tol);
}
}
// ---- Agreement with CPU HouseholderQR ---------------------------------------
template <typename Scalar>
void test_qr_vs_cpu(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
GpuQR<Scalar> gpu_qr(A);
VERIFY_IS_EQUAL(gpu_qr.info(), Success);
Mat X_gpu = gpu_qr.solve(B);
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_qr_solve_square<Scalar>(1, 1));
CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 1));
CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 4));
CALL_SUBTEST(test_qr_solve_square<Scalar>(256, 8));
CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(128, 64, 4));
CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(256, 128, 1));
CALL_SUBTEST(test_qr_solve_device<Scalar>(64, 4));
CALL_SUBTEST(test_qr_solve_overdetermined_device<Scalar>(128, 64, 4));
CALL_SUBTEST(test_qr_multiple_solves<Scalar>(64));
CALL_SUBTEST(test_qr_vs_cpu<Scalar>(64, 4));
CALL_SUBTEST(test_qr_vs_cpu<Scalar>(256, 8));
}
void test_qr_empty() {
GpuQR<double> qr(MatrixXd(0, 0));
VERIFY_IS_EQUAL(qr.info(), Success);
VERIFY_IS_EQUAL(qr.rows(), 0);
VERIFY_IS_EQUAL(qr.cols(), 0);
}
EIGEN_DECLARE_TEST(gpu_cusolver_qr) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_qr_empty());
}

194
test/gpu_cusolver_svd.cpp Normal file
View File

@@ -0,0 +1,194 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSVD: GPU SVD via cuSOLVER.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/SVD>
#include <Eigen/GPU>
using namespace Eigen;
// ---- SVD reconstruction: U * diag(S) * VT ≈ A ------------------------------
template <typename Scalar, unsigned int Options>
void test_svd_reconstruction(Index m, Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, n);
GpuSVD<Scalar> svd(A, Options);
VERIFY_IS_EQUAL(svd.info(), Success);
auto S = svd.singularValues();
Mat U = svd.matrixU();
Mat VT = svd.matrixVT();
const Index k = (std::min)(m, n);
// Reconstruct: A_hat = U[:,:k] * diag(S) * VT[:k,:].
Mat A_hat = U.leftCols(k) * S.asDiagonal() * VT.topRows(k);
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(k)) * NumTraits<Scalar>::epsilon() * A.norm();
VERIFY((A_hat - A).norm() < tol);
// Orthogonality: U^H * U ≈ I.
Mat UtU = U.adjoint() * U;
Mat I_u = Mat::Identity(U.cols(), U.cols());
VERIFY((UtU - I_u).norm() < tol);
// Orthogonality: VT * VT^H ≈ I.
Mat VtVh = VT * VT.adjoint();
Mat I_v = Mat::Identity(VT.rows(), VT.rows());
VERIFY((VtVh - I_v).norm() < tol);
}
// ---- Singular values match CPU BDCSVD ---------------------------------------
template <typename Scalar>
void test_svd_singular_values(Index m, Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, n);
GpuSVD<Scalar> svd(A, 0); // values only
VERIFY_IS_EQUAL(svd.info(), Success);
auto S_gpu = svd.singularValues();
auto S_cpu = BDCSVD<Mat>(A, 0).singularValues();
RealScalar tol =
RealScalar(5) * std::sqrt(static_cast<RealScalar>((std::min)(m, n))) * NumTraits<Scalar>::epsilon() * S_cpu(0);
VERIFY((S_gpu - S_cpu).norm() < tol);
}
// ---- Solve: pseudoinverse ---------------------------------------------------
template <typename Scalar>
void test_svd_solve(Index m, Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, n);
Mat B = Mat::Random(m, nrhs);
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
VERIFY_IS_EQUAL(svd.info(), Success);
Mat X = svd.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
// Compare with CPU BDCSVD solve.
Mat X_cpu = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV).solve(B);
RealScalar tol = RealScalar(100) * RealScalar((std::max)(m, n)) * NumTraits<Scalar>::epsilon();
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
}
// ---- Solve: truncated -------------------------------------------------------
template <typename Scalar>
void test_svd_solve_truncated(Index m, Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, n);
Mat B = Mat::Random(m, 1);
const Index k = (std::min)(m, n);
const Index trunc = k / 2;
eigen_assert(trunc > 0);
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
Mat X_trunc = svd.solve(B, trunc);
// Build CPU reference: truncated pseudoinverse.
auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
auto S = cpu_svd.singularValues();
Mat U = cpu_svd.matrixU();
Mat V = cpu_svd.matrixV();
// D_ii = 1/S_i for i < trunc, 0 otherwise.
Matrix<RealScalar, Dynamic, 1> D = Matrix<RealScalar, Dynamic, 1>::Zero(k);
for (Index i = 0; i < trunc; ++i) D(i) = RealScalar(1) / S(i);
Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;
RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
VERIFY((X_trunc - X_ref).norm() / X_ref.norm() < tol);
}
// ---- Solve: Tikhonov regularized --------------------------------------------
template <typename Scalar>
void test_svd_solve_regularized(Index m, Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, n);
Mat B = Mat::Random(m, 1);
RealScalar lambda = RealScalar(0.1);
const Index k = (std::min)(m, n);
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
Mat X_reg = svd.solve(B, lambda);
// CPU reference: D_ii = S_i / (S_i^2 + lambda^2).
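// (This diagonal is the closed-form minimizer of ||A*x - b||^2 +
// lambda^2 * ||x||^2 written in the SVD basis.)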
auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
auto S = cpu_svd.singularValues();
Mat U = cpu_svd.matrixU();
Mat V = cpu_svd.matrixV();
Matrix<RealScalar, Dynamic, 1> D(k);
for (Index i = 0; i < k; ++i) D(i) = S(i) / (S(i) * S(i) + lambda * lambda);
Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;
RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
VERIFY((X_reg - X_ref).norm() / X_ref.norm() < tol);
}
// ---- Empty matrix -----------------------------------------------------------
void test_svd_empty() {
GpuSVD<double> svd(MatrixXd(0, 0), 0);
VERIFY_IS_EQUAL(svd.info(), Success);
VERIFY_IS_EQUAL(svd.rows(), 0);
VERIFY_IS_EQUAL(svd.cols(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
// Reconstruction + orthogonality (thin and full, identical test logic).
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 64)));
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(128, 64)));
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 128))); // wide (m < n)
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(64, 64)));
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(128, 64)));
// Singular values.
CALL_SUBTEST(test_svd_singular_values<Scalar>(64, 64));
CALL_SUBTEST(test_svd_singular_values<Scalar>(128, 64));
// Solve.
CALL_SUBTEST(test_svd_solve<Scalar>(64, 64, 4));
CALL_SUBTEST(test_svd_solve<Scalar>(128, 64, 4));
CALL_SUBTEST(test_svd_solve<Scalar>(64, 128, 4)); // wide (m < n)
// Truncated and regularized solve.
CALL_SUBTEST(test_svd_solve_truncated<Scalar>(64, 64));
CALL_SUBTEST(test_svd_solve_regularized<Scalar>(64, 64));
}
EIGEN_DECLARE_TEST(gpu_cusolver_svd) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_svd_empty());
}

305
test/gpu_cusparse_spmv.cpp Normal file
View File

@@ -0,0 +1,305 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseContext: GPU SpMV/SpMM via cuSPARSE.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse matrix -----------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_sparse(Index rows, Index cols, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat R(rows, cols);
R.reserve(VectorXi::Constant(cols, static_cast<int>(rows * density) + 1));
for (Index j = 0; j < cols; ++j) {
for (Index i = 0; i < rows; ++i) {
if ((std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(RealScalar(std::rand() / double(RAND_MAX) - 0.5));
}
}
}
R.makeCompressed();
return R;
}
// ---- SpMV: y = A * x -------------------------------------------------------
template <typename Scalar>
void test_spmv(Index rows, Index cols) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(rows, cols);
Vec x = Vec::Random(cols);
GpuSparseContext<Scalar> ctx;
Vec y_gpu = ctx.multiply(A, x);
Vec y_cpu = A * x;
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
VERIFY_IS_EQUAL(y_gpu.size(), rows);
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- SpMV with alpha/beta: y = alpha*A*x + beta*y ---------------------------
template <typename Scalar>
void test_spmv_alpha_beta(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(n, n);
Vec x = Vec::Random(n);
Vec y_init = Vec::Random(n);
Scalar alpha(2);
Scalar beta(3);
Vec y_cpu = alpha * (A * x) + beta * y_init;
GpuSparseContext<Scalar> ctx;
Vec y_gpu = y_init;
ctx.multiply(A, x, y_gpu, alpha, beta);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- Transpose: y = A^T * x ------------------------------------------------
template <typename Scalar>
void test_spmv_transpose(Index rows, Index cols) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(rows, cols);
Vec x = Vec::Random(rows);
GpuSparseContext<Scalar> ctx;
Vec y_gpu = ctx.multiplyT(A, x);
Vec y_cpu = A.transpose() * x;
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
VERIFY_IS_EQUAL(y_gpu.size(), cols);
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- SpMM: Y = A * X (multiple RHS) ----------------------------------------
template <typename Scalar>
void test_spmm(Index rows, Index cols, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(rows, cols);
Mat X = Mat::Random(cols, nrhs);
GpuSparseContext<Scalar> ctx;
Mat Y_gpu = ctx.multiplyMat(A, X);
Mat Y_cpu = A * X;
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
VERIFY_IS_EQUAL(Y_gpu.rows(), rows);
VERIFY_IS_EQUAL(Y_gpu.cols(), nrhs);
VERIFY((Y_gpu - Y_cpu).norm() / (Y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- Identity matrix: I * x = x --------------------------------------------
template <typename Scalar>
void test_identity(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
// Build sparse identity.
SpMat eye(n, n);
eye.setIdentity();
eye.makeCompressed();
Vec x = Vec::Random(n);
GpuSparseContext<Scalar> ctx;
Vec y = ctx.multiply(eye, x);
RealScalar tol = NumTraits<Scalar>::epsilon();
VERIFY((y - x).norm() < tol);
}
// ---- Context reuse ----------------------------------------------------------
template <typename Scalar>
void test_reuse(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
GpuSparseContext<Scalar> ctx;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
for (int trial = 0; trial < 3; ++trial) {
SpMat A = make_sparse<Scalar>(n, n);
Vec x = Vec::Random(n);
Vec y_gpu = ctx.multiply(A, x);
Vec y_cpu = A * x;
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
}
// ---- Empty ------------------------------------------------------------------
template <typename Scalar>
void test_empty() {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
SpMat A(0, 0);
A.makeCompressed();
Vec x(0);
GpuSparseContext<Scalar> ctx;
Vec y = ctx.multiply(A, x);
VERIFY_IS_EQUAL(y.size(), 0);
}
// ---- DeviceMatrix SpMV (no host roundtrip) ----------------------------------
template <typename Scalar>
void test_spmv_device(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(n, n);
Vec x = Vec::Random(n);
// Use shared GpuContext for same-stream execution.
GpuContext gpu_ctx;
GpuSparseContext<Scalar> ctx(gpu_ctx);
auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
DeviceMatrix<Scalar> d_y;
ctx.multiply(A, d_x, d_y);
Vec y_gpu = d_y.toHost(gpu_ctx.stream());
Vec y_cpu = A * x;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- Expression syntax: d_y = d_A * d_x ------------------------------------
template <typename Scalar>
void test_spmv_expr(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(n, n);
Vec x = Vec::Random(n);
GpuContext gpu_ctx;
GpuSparseContext<Scalar> ctx(gpu_ctx);
// Upload sparse matrix and create device view.
auto d_A = ctx.deviceView(A);
// Upload x.
auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
// Expression syntax: d_y = d_A * d_x
DeviceMatrix<Scalar> d_y;
d_y = d_A * d_x;
// Also test with noalias():
DeviceMatrix<Scalar> d_tmp;
d_tmp.noalias() = d_A * d_x;
Vec y_gpu = d_y.toHost(gpu_ctx.stream());
Vec tmp_gpu = d_tmp.toHost(gpu_ctx.stream());
Vec y_cpu = A * x;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
VERIFY((tmp_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
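// A minimal sketch (assumption, not library code): operator* on the device
// view plus noalias() on DeviceMatrix let a routine written against generic
// "A * x" syntax accept host Eigen types and device types alike.
// `residual_norm_sketch` is a hypothetical helper; it assumes the
// argument-free DeviceMatrix overloads resolve against a default or
// thread-local GpuContext, as the operator tests below suggest.
template <typename MatT, typename VecT>
auto residual_norm_sketch(const MatT& A, const VecT& x, const VecT& b) {
  VecT r;
  r.noalias() = A * x;  // device-resident SpMV when MatT is a device view
  r -= b;               // cuBLAS axpy on DeviceMatrix; plain Eigen on host
  return r.norm();      // sign flipped vs. b - A*x, but the norm is identical
}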
// ---- deviceView overwrite: second view replaces first -----------------------
template <typename Scalar>
void test_deviceview_overwrite(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A1 = make_sparse<Scalar>(n, n);
SpMat A2 = make_sparse<Scalar>(n, n); // different random matrix
Vec x = Vec::Random(n);
GpuContext gpu_ctx;
GpuSparseContext<Scalar> ctx(gpu_ctx);
// First view: A1.
auto d_A1 = ctx.deviceView(A1);
auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
DeviceMatrix<Scalar> d_y1;
d_y1 = d_A1 * d_x;
Vec y1_gpu = d_y1.toHost(gpu_ctx.stream());
Vec y1_cpu = A1 * x;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y1_gpu - y1_cpu).norm() / (y1_cpu.norm() + RealScalar(1)) < tol);
// Second view overwrites first: now uses A2.
auto d_A2 = ctx.deviceView(A2);
DeviceMatrix<Scalar> d_y2;
d_y2 = d_A2 * d_x;
Vec y2_gpu = d_y2.toHost(gpu_ctx.stream());
Vec y2_cpu = A2 * x;
VERIFY((y2_gpu - y2_cpu).norm() / (y2_cpu.norm() + RealScalar(1)) < tol);
}
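// Note (a reading of the test above, not documented API): the context keeps a
// single device-side copy of the uploaded matrix, so a second deviceView()
// call replaces the first. Keep at most one live view per GpuSparseContext,
// or use separate contexts when two matrices must coexist on the device.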
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_spmv<Scalar>(64, 64));
CALL_SUBTEST(test_spmv<Scalar>(128, 64)); // tall (rows > cols)
CALL_SUBTEST(test_spmv<Scalar>(64, 128)); // wide (cols > rows)
CALL_SUBTEST(test_spmv_alpha_beta<Scalar>(64));
CALL_SUBTEST(test_spmv_transpose<Scalar>(128, 64));
CALL_SUBTEST(test_spmm<Scalar>(64, 64, 4));
CALL_SUBTEST(test_identity<Scalar>(64));
CALL_SUBTEST(test_reuse<Scalar>(64));
CALL_SUBTEST(test_empty<Scalar>());
CALL_SUBTEST(test_spmv_device<Scalar>(64));
CALL_SUBTEST(test_spmv_expr<Scalar>(64));
CALL_SUBTEST(test_deviceview_overwrite<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
}

View File

@@ -12,6 +12,7 @@
#define EIGEN_USE_GPU
#include "main.h"
+#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
@@ -35,7 +36,6 @@ void test_allocate(Index rows, Index cols) {
VERIFY(!dm.empty());
VERIFY_IS_EQUAL(dm.rows(), rows);
VERIFY_IS_EQUAL(dm.cols(), cols);
-VERIFY_IS_EQUAL(dm.outerStride(), rows);
VERIFY(dm.data() != nullptr);
VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar));
}
@@ -69,7 +69,7 @@ void test_roundtrip_async(Index rows, Index cols) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));
// Async upload from raw pointer.
-auto dm = DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, rows, stream);
+auto dm = DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, stream);
VERIFY_IS_EQUAL(dm.rows(), rows);
VERIFY_IS_EQUAL(dm.cols(), cols);
@@ -185,7 +185,6 @@ void test_resize() {
dm.resize(50, 30);
VERIFY_IS_EQUAL(dm.rows(), 50);
VERIFY_IS_EQUAL(dm.cols(), 30);
-VERIFY_IS_EQUAL(dm.outerStride(), 50);
VERIFY(dm.data() != nullptr);
// Resize to same dimensions is a no-op.
@@ -232,6 +231,217 @@ void test_scalar() {
CALL_SUBTEST(test_move_assign<Scalar>(64, 64));
}
// ---- BLAS-1: dot product ----------------------------------------------------
template <typename Scalar>
void test_blas1(Index n) {
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
// All BLAS-1 ops share one GpuContext — same stream, zero event overhead.
GpuContext ctx;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
// dot
{
Vec a = Vec::Random(n);
Vec b = Vec::Random(n);
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
Scalar gpu_dot = d_a.dot(ctx, d_b);
Scalar cpu_dot = a.dot(b);
VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
}
// norm / squaredNorm
{
Vec a = Vec::Random(n);
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
RealScalar gpu_norm = d_a.norm(ctx);
RealScalar cpu_norm = a.norm();
VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol);
RealScalar gpu_sqnorm = d_a.squaredNorm(ctx);
RealScalar cpu_sqnorm = a.squaredNorm();
VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol);
}
// addScaled (axpy)
{
Vec x = Vec::Random(n);
Vec y = Vec::Random(n);
Scalar alpha(2.5);
Vec y_ref = y + alpha * x;
auto d_y = DeviceMatrix<Scalar>::fromHost(y, ctx.stream());
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
d_y.addScaled(ctx, alpha, d_x);
Vec y_gpu = d_y.toHost(ctx.stream());
VERIFY((y_gpu - y_ref).norm() < tol * y_ref.norm() + tol);
}
// scale (scal)
{
Vec x = Vec::Random(n);
Scalar alpha(3.0);
Vec x_ref = alpha * x;
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
d_x.scale(ctx, alpha);
Vec x_gpu = d_x.toHost(ctx.stream());
VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol);
}
// copyFrom
{
Vec x = Vec::Random(n);
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
DeviceMatrix<Scalar> d_y;
d_y.copyFrom(ctx, d_x);
Vec y = d_y.toHost(ctx.stream());
VERIFY_IS_APPROX(y, x);
}
// setZero
{
Vec x = Vec::Random(n);
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
d_x.setZero(ctx);
Vec result = d_x.toHost(ctx.stream());
VERIFY_IS_EQUAL(result, Vec::Zero(n));
}
}
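// Note (an observation, not documented API): every op above enqueues onto
// ctx's stream; the only host syncs are the explicit toHost() calls and the
// reads of DeviceScalar results (dot/norm/squaredNorm), so chained BLAS-1
// calls can run back-to-back without intervening synchronization.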
// ---- BLAS-1 operator overloads (CG-style) -----------------------------------
template <typename Scalar>
void test_cg_operators(Index n) {
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
Vec x = Vec::Random(n);
Vec p = Vec::Random(n);
Vec tmp = Vec::Random(n);
Vec z = Vec::Random(n);
Scalar alpha(2.5);
Scalar beta(0.7);
// Test: x += alpha * p
{
Vec x_ref = x + alpha * p;
auto d_x = DeviceMatrix<Scalar>::fromHost(x);
auto d_p = DeviceMatrix<Scalar>::fromHost(p);
d_x += alpha * d_p;
Vec x_gpu = d_x.toHost();
VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol);
}
// Test: r -= alpha * tmp
{
Vec r = Vec::Random(n);
Vec r_ref = r - alpha * tmp;
auto d_r = DeviceMatrix<Scalar>::fromHost(r);
auto d_tmp = DeviceMatrix<Scalar>::fromHost(tmp);
d_r -= alpha * d_tmp;
Vec r_gpu = d_r.toHost();
VERIFY((r_gpu - r_ref).norm() < tol * r_ref.norm() + tol);
}
// Test: p = z + beta * p (cuBLAS geam)
{
Vec p_copy = p;
Vec p_ref = z + beta * p_copy;
auto d_p = DeviceMatrix<Scalar>::fromHost(p_copy);
auto d_z = DeviceMatrix<Scalar>::fromHost(z);
d_p = d_z + beta * d_p;
Vec p_gpu = d_p.toHost();
VERIFY((p_gpu - p_ref).norm() < tol * p_ref.norm() + tol);
}
// Test: operator+= and operator-= with DeviceMatrix (no scalar)
{
Vec a = Vec::Random(n);
Vec b = Vec::Random(n);
Vec a_ref = a + b;
auto d_a = DeviceMatrix<Scalar>::fromHost(a);
auto d_b = DeviceMatrix<Scalar>::fromHost(b);
d_a += d_b;
VERIFY((d_a.toHost() - a_ref).norm() < tol * a_ref.norm() + tol);
}
}
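// A minimal sketch (assumption, not library code) of how the operators
// exercised above compose into unpreconditioned conjugate gradient, the
// "CG-style" workload this section targets. `cg_sketch` is a hypothetical
// helper; SpView stands in for a device-resident sparse view supplying
// operator*, and A is assumed self-adjoint positive definite.
template <typename Scalar, typename SpView>
void cg_sketch(GpuContext& ctx, const SpView& d_A, DeviceMatrix<Scalar>& d_x,
               const DeviceMatrix<Scalar>& d_b, int iters) {
  DeviceMatrix<Scalar> d_r, d_p, d_tmp;
  d_r.copyFrom(ctx, d_b);
  d_tmp = d_A * d_x;                       // device-resident SpMV
  d_r -= d_tmp;                            // r = b - A*x
  d_p.copyFrom(ctx, d_r);
  Scalar rsq = d_r.dot(ctx, d_r);          // DeviceScalar read -> host sync
  for (int it = 0; it < iters; ++it) {
    d_tmp = d_A * d_p;
    Scalar alpha = rsq / Scalar(d_p.dot(ctx, d_tmp));
    d_x += alpha * d_p;                    // axpy
    d_r -= alpha * d_tmp;                  // axpy
    Scalar rsq_new = d_r.dot(ctx, d_r);
    Scalar beta = rsq_new / rsq;
    d_p = d_r + beta * d_p;                // geam, aliasing pattern tested above
    rsq = rsq_new;
  }
}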
// ---- DeviceScalar: deferred sync -------------------------------------------
template <typename Scalar>
void test_device_scalar() {
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
const Index n = 256;
Vec a = Vec::Random(n);
Vec b = Vec::Random(n);
GpuContext ctx;
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
// dot() returns DeviceScalar — implicit conversion to Scalar syncs.
Scalar gpu_dot = d_a.dot(ctx, d_b);
Scalar cpu_dot = a.dot(b);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
// squaredNorm() returns host RealScalar directly (syncs internally).
RealScalar gpu_sqnorm = d_a.squaredNorm(ctx);
RealScalar cpu_sqnorm = a.squaredNorm();
VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol);
// norm() returns DeviceScalar<RealScalar> — implicit conversion syncs.
RealScalar gpu_norm = d_a.norm(ctx);
RealScalar cpu_norm = a.norm();
VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol);
// Convenience overloads (thread-local context).
GpuContext::setThreadLocal(&ctx);
Scalar gpu_dot2 = d_a.dot(d_b);
VERIFY(numext::abs(gpu_dot2 - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
GpuContext::setThreadLocal(nullptr);
// Empty vectors: dot and norm must return zero.
{
DeviceMatrix<Scalar> d_empty(0, 1);
DeviceMatrix<Scalar> d_empty2(0, 1);
Scalar empty_dot = d_empty.dot(ctx, d_empty2);
VERIFY_IS_EQUAL(empty_dot, Scalar(0));
RealScalar empty_sqnorm = d_empty.squaredNorm(ctx);
VERIFY_IS_EQUAL(empty_sqnorm, RealScalar(0));
RealScalar empty_norm = d_empty.norm(ctx);
VERIFY_IS_EQUAL(empty_norm, RealScalar(0));
}
}
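// A minimal sketch (assumption) of the deferred-sync pattern DeviceScalar
// enables: queue several reductions, then pay for at most one stream sync
// when the first value is read. `deferred_sync_sketch` is hypothetical.
template <typename Scalar>
void deferred_sync_sketch(GpuContext& ctx, const DeviceMatrix<Scalar>& d_a,
                          const DeviceMatrix<Scalar>& d_b) {
  using RealScalar = typename NumTraits<Scalar>::Real;
  auto s1 = d_a.dot(ctx, d_b);  // DeviceScalar<Scalar>: enqueued, no sync yet
  auto s2 = d_a.norm(ctx);      // DeviceScalar<RealScalar>: still no sync
  Scalar v1 = s1;               // implicit conversion forces the host sync
  RealScalar v2 = s2;           // typically already complete by this point
  EIGEN_UNUSED_VARIABLE(v1);
  EIGEN_UNUSED_VARIABLE(v2);
}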
// ---- cwiseProduct -----------------------------------------------------------
template <typename Scalar>
void test_cwiseProduct() {
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
const Index n = 256;
Vec a = Vec::Random(n);
Vec b = Vec::Random(n);
Vec ref = a.array() * b.array();
GpuContext ctx;
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
auto d_c = d_a.cwiseProduct(ctx, d_b);
Vec result = d_c.toHost(ctx.stream());
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((result - ref).norm() < tol * ref.norm() + tol);
}
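// A minimal sketch (assumption, not library code): cwiseProduct is exactly
// the kernel a Jacobi (diagonal) preconditioner needs: z = D^{-1} r is an
// elementwise multiply by the precomputed inverse diagonal, kept resident on
// the device. `apply_jacobi_sketch` is a hypothetical helper.
template <typename Scalar>
DeviceMatrix<Scalar> apply_jacobi_sketch(GpuContext& ctx,
                                         const DeviceMatrix<Scalar>& d_inv_diag,
                                         const DeviceMatrix<Scalar>& d_r) {
  return d_inv_diag.cwiseProduct(ctx, d_r);  // no host roundtrip
}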
EIGEN_DECLARE_TEST(gpu_device_matrix) {
CALL_SUBTEST(test_default_construct());
CALL_SUBTEST(test_empty());
@@ -244,4 +454,18 @@ EIGEN_DECLARE_TEST(gpu_device_matrix) {
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_blas1<float>(256));
CALL_SUBTEST(test_blas1<double>(256));
CALL_SUBTEST(test_blas1<std::complex<float>>(256));
CALL_SUBTEST(test_blas1<std::complex<double>>(256));
CALL_SUBTEST(test_cg_operators<float>(256));
CALL_SUBTEST(test_cg_operators<double>(256));
CALL_SUBTEST(test_cg_operators<std::complex<float>>(256));
CALL_SUBTEST(test_cg_operators<std::complex<double>>(256));
CALL_SUBTEST(test_device_scalar<float>());
CALL_SUBTEST(test_device_scalar<double>());
CALL_SUBTEST(test_device_scalar<std::complex<float>>());
CALL_SUBTEST(test_device_scalar<std::complex<double>>());
CALL_SUBTEST(test_cwiseProduct<float>());
CALL_SUBTEST(test_cwiseProduct<double>());
}