Compare commits


2 Commits

Author SHA1 Message Date
Rasmus Munk Larsen
014f12f11a GPU: Add BLAS-1 ops, DeviceScalar, device-resident SpMV, and CG interop (5/5)
Add the operator interface needed for GPU iterative solvers:

- BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(),
  noalias(), operator+=/-=/*= dispatching to cuBLAS axpy/scal/dot/nrm2.
- DeviceScalar<Scalar>: device-resident scalar returned by reductions.
  Defers host sync until value is read (implicit conversion). Device-side
  division via NPP for real types.
- GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(),
  cusparseHandle().
- GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm
  selection and plan caching.
- GpuSparseContext: GpuContext& constructor for same-stream execution,
  deviceView() returning DeviceSparseView with operator* for device-resident
  SpMV (d_y = d_A * d_x).
- geam expressions: d_C = d_A + alpha * d_B via cublasXgeam.
- GpuSVD::matrixV() convenience wrapper.

These additions make DeviceMatrix usable as a VectorType in Eigen algorithm
templates. Conjugate gradient is the motivating example and is tested against
CPU ConjugateGradient for correctness.
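
A minimal sketch of one CG-style update using only the pieces above (illustrative; d_A is a device-resident sparse view and d_x, d_r, d_p, d_tmp are DeviceMatrix<double> column vectors set up elsewhere):

    GpuContext ctx;
    GpuContext::setThreadLocal(&ctx);            // operators below reuse ctx's stream and handles
    d_tmp.noalias() = d_A * d_p;                 // device-resident SpMV
    auto absNew = d_r.squaredNorm(ctx);          // DeviceScalar, no host sync yet
    auto alpha  = absNew / d_p.dot(ctx, d_tmp);  // device-side division via NPP, still no sync
    d_x += alpha * d_p;                          // axpy with CUBLAS_POINTER_MODE_DEVICE
    d_r -= alpha * d_tmp;
    double res = d_r.squaredNorm(ctx);           // implicit conversion: the sync happens here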

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 20:19:59 -07:00
Rasmus Munk Larsen
43a95b62bb GPU: Add sparse solvers, FFT, and SpMV (cuDSS, cuFFT, cuSPARSE)
Add GPU sparse direct solvers (Cholesky, LDL^T, LU) via cuDSS, 1D/2D FFT
via cuFFT with plan caching, and sparse matrix-vector/matrix multiply
(SpMV/SpMM) via cuSPARSE.
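
Usage illustration (the FFT calls restate the example from the GpuFFT header further down in this diff; the sparse-solver lines are an assumption, written to mirror Eigen's SimplicialLLT compute()/solve() interface):

    GpuFFT<float> fft;
    VectorXcf X = fft.fwd(x);           // 1D forward FFT (R2C here since x is real)
    VectorXf  r = fft.invReal(X, n);    // 1D C2R inverse, scaled by 1/n

    GpuSparseLLT<SparseMatrix<double>> llt;   // assumed template signature
    llt.compute(A);                           // analysis + factorization on the GPU
    VectorXd xs = llt.solve(b);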

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 19:11:49 -07:00
37 changed files with 6255 additions and 199 deletions

View File

@@ -39,6 +39,7 @@
#ifdef EIGEN_USE_GPU
// IWYU pragma: begin_exports
#include "src/GPU/DeviceScalar.h"
#include "src/GPU/DeviceMatrix.h"
#include "src/GPU/GpuContext.h"
#include "src/GPU/DeviceExpr.h"
@@ -50,6 +51,19 @@
#include "src/GPU/GpuQR.h"
#include "src/GPU/GpuSVD.h"
#include "src/GPU/GpuEigenSolver.h"
#include "src/GPU/CuFftSupport.h"
#include "src/GPU/GpuFFT.h"
#include "src/GPU/CuSparseSupport.h"
#ifdef EIGEN_SPARSECORE_MODULE_H
#include "src/GPU/GpuSparseContext.h"
#endif
#if defined(EIGEN_CUDSS) && defined(EIGEN_SPARSECORE_MODULE_H)
#include "src/GPU/CuDssSupport.h"
#include "src/GPU/GpuSparseSolverBase.h"
#include "src/GPU/GpuSparseLLT.h"
#include "src/GPU/GpuSparseLDLT.h"
#include "src/GPU/GpuSparseLU.h"
#endif
// IWYU pragma: end_exports
#endif

View File

@@ -21,6 +21,7 @@
#include "./GpuSupport.h"
#include <cublas_v2.h>
#include <cublasLt.h>
namespace Eigen {
namespace internal {
@@ -50,14 +51,14 @@ constexpr cublasOperation_t to_cublas_op(GpuOp op) {
}
// ---- Scalar → cublasComputeType_t -------------------------------------------
// cublasGemmEx requires a compute type (separate from the data type).
// cublasLtMatmul requires a compute type (separate from the data type).
//
// Precision policy:
// - Default: tensor core algorithms enabled (CUBLAS_GEMM_DEFAULT_TENSOR_OP).
// - Default: tensor core algorithms enabled via cublasLtMatmul heuristics.
// For double, cuBLAS may use Ozaki emulation on sm_80+ tensor cores.
// - EIGEN_CUDA_TF32: opt-in to TF32 for float (~2x faster, 10-bit mantissa).
// - EIGEN_NO_CUDA_TENSOR_OPS: disables all tensor core usage. Uses pedantic
// compute types and CUBLAS_GEMM_DEFAULT algorithm. For bit-exact reproducibility.
// compute types. For bit-exact reproducibility.
template <typename Scalar>
struct cuda_compute_type;
@@ -98,8 +99,47 @@ struct cuda_compute_type<std::complex<double>> {
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
#endif
};
// ---- GEMM algorithm hint ----------------------------------------------------
// ---- Alpha/beta scalar type for cublasLtMatmul ------------------------------
// For standard types, alpha/beta match the scalar type.
template <typename Scalar>
struct cuda_gemm_scalar {
using type = Scalar;
};
// ---- cublasLt GEMM dispatch -------------------------------------------------
// Wraps cublasLtMatmul with descriptor setup, heuristic algorithm selection,
// and lazy workspace management. Supports 64-bit dimensions natively.
//
// The workspace buffer (DeviceBuffer*) is grown lazily to match the selected
// algorithm's actual requirement. The heuristic is queried with a generous
// 32 MB cap so that the best algorithm is never excluded. Growth is monotonic:
// the buffer only grows, never shrinks, so reallocation happens at most a few
// times during the lifetime of the owning GpuContext or solver.
//
// EIGEN_NO_CUDA_TENSOR_OPS: pedantic compute types (CUBLAS_COMPUTE_32F_PEDANTIC,
// CUBLAS_COMPUTE_64F_PEDANTIC) prevent cublasLt from selecting tensor core
// algorithms, matching the previous cublasGemmEx behavior.
//
// Thread safety: the workspace buffer is not thread-safe. All GEMM calls
// sharing a workspace must be on the same CUDA stream (guaranteed by GpuContext's
// single-stream design and by each GpuSVD owning its own stream).
//
// Future optimization: for hot loops (e.g., CG iteration), caching descriptors
// and the selected algorithm by (m, n, k, dtype, transA, transB) would avoid
// per-call descriptor creation and heuristic lookup overhead.
#define EIGEN_CUBLASLT_CHECK(expr) \
do { \
cublasStatus_t _s = (expr); \
eigen_assert(_s == CUBLAS_STATUS_SUCCESS && "cuBLASLt call failed"); \
} while (0)
// Maximum workspace the heuristic is allowed to consider. This is a preference
// ceiling, not an allocation — actual allocation matches the selected algorithm.
static constexpr size_t kCublasLtMaxWorkspaceBytes = 32 * 1024 * 1024; // 32 MB
// cublasGemmEx fallback algorithm hint (used when cublasLt heuristic returns no results).
constexpr cublasGemmAlgo_t cuda_gemm_algo() {
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
return CUBLAS_GEMM_DEFAULT;
@@ -108,13 +148,73 @@ constexpr cublasGemmAlgo_t cuda_gemm_algo() {
#endif
}
// ---- Alpha/beta scalar type for cublasGemmEx --------------------------------
// For standard types, alpha/beta match the scalar type.
template <typename Scalar>
struct cuda_gemm_scalar {
using type = Scalar;
};
template <typename Scalar>
void cublaslt_gemm(cublasLtHandle_t lt_handle, cublasHandle_t cublas_handle, cublasOperation_t transA,
cublasOperation_t transB, int64_t m, int64_t n, int64_t k,
const typename cuda_gemm_scalar<Scalar>::type* alpha, const Scalar* A, int64_t lda, const Scalar* B,
int64_t ldb, const typename cuda_gemm_scalar<Scalar>::type* beta, Scalar* C, int64_t ldc,
DeviceBuffer* workspace, cudaStream_t stream) {
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
using AlphaType = typename cuda_gemm_scalar<Scalar>::type;
constexpr cudaDataType_t alpha_type = cuda_data_type<AlphaType>::value;
// Matmul descriptor.
cublasLtMatmulDesc_t matmul_desc = nullptr;
EIGEN_CUBLASLT_CHECK(cublasLtMatmulDescCreate(&matmul_desc, compute, alpha_type));
EIGEN_CUBLASLT_CHECK(
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transA, sizeof(transA)));
EIGEN_CUBLASLT_CHECK(
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, sizeof(transB)));
// Matrix layout descriptors (column-major).
// Physical layout dimensions: rows × cols with leading dimension lda/ldb/ldc.
const int64_t a_rows = (transA == CUBLAS_OP_N) ? m : k;
const int64_t b_rows = (transB == CUBLAS_OP_N) ? k : n;
cublasLtMatrixLayout_t layout_A = nullptr, layout_B = nullptr, layout_C = nullptr;
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_A, dtype, a_rows, (transA == CUBLAS_OP_N) ? k : m, lda));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_B, dtype, b_rows, (transB == CUBLAS_OP_N) ? n : k, ldb));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_C, dtype, m, n, ldc));
// Heuristic selection: query with generous workspace cap, allocate only what's needed.
cublasLtMatmulPreference_t preference = nullptr;
EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceCreate(&preference));
size_t max_ws = kCublasLtMaxWorkspaceBytes;
EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&max_ws, sizeof(max_ws)));
cublasLtMatmulHeuristicResult_t result;
int returned_results = 0;
cublasStatus_t heuristic_status = cublasLtMatmulAlgoGetHeuristic(lt_handle, matmul_desc, layout_A, layout_B, layout_C,
layout_C, preference, 1, &result, &returned_results);
if (heuristic_status == CUBLAS_STATUS_SUCCESS && returned_results > 0) {
// cublasLt path: use the selected algorithm with lazy workspace.
const size_t needed = result.workspaceSize;
if (needed > workspace->size()) {
// Sync only when freeing an existing buffer that may be in use.
if (workspace->ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
*workspace = DeviceBuffer(needed);
}
EIGEN_CUBLASLT_CHECK(cublasLtMatmul(lt_handle, matmul_desc, alpha, A, layout_A, B, layout_B, beta, C, layout_C, C,
layout_C, &result.algo, workspace->ptr, needed, stream));
} else {
// Fallback: cublasGemmEx for shapes/types that cublasLt cannot handle.
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_handle, transA, transB, static_cast<int>(m), static_cast<int>(n),
static_cast<int>(k), alpha, A, dtype, static_cast<int>(lda), B, dtype,
static_cast<int>(ldb), beta, C, dtype, static_cast<int>(ldc), compute,
cuda_gemm_algo()));
}
// Cleanup descriptors.
EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceDestroy(preference));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_C));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_B));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_A));
EIGEN_CUBLASLT_CHECK(cublasLtMatmulDescDestroy(matmul_desc));
}
// ---- Type-specific cuBLAS wrappers ------------------------------------------
// cuBLAS uses separate functions per type (Strsm, Dtrsm, etc.).
@@ -227,6 +327,100 @@ inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cu
reinterpret_cast<const cuDoubleComplex*>(B), ldb, reinterpret_cast<cuDoubleComplex*>(C), ldc);
}
// ---- cuBLAS Level-1 wrappers ------------------------------------------------
// Type-dispatched wrappers for BLAS-1 vector operations: dot, axpy, nrm2, scal, copy.
// These work with CUBLAS_POINTER_MODE_HOST or CUBLAS_POINTER_MODE_DEVICE depending
// on the caller's configuration. For device pointer mode, scalar result pointers
// (dot, nrm2) must point to device memory.
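//
// Example (sketch): a dot product whose result stays on the device.
//   EIGEN_CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE));
//   EIGEN_CUBLAS_CHECK(cublasXdot(handle, n, d_x, 1, d_y, 1, d_result));  // d_result points to device memory
//   EIGEN_CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST));
// This is what DeviceMatrix::dot() does (see DeviceDispatch.h below), except that it
// restores whatever pointer mode was previously set on the handle.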
// dot: result = x^T * y (real) or x^H * y (complex conjugate dot)
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const float* x, int incx, const float* y, int incy,
float* result) {
return cublasSdot(h, n, x, incx, y, incy, result);
}
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const double* x, int incx, const double* y, int incy,
double* result) {
return cublasDdot(h, n, x, incx, y, incy, result);
}
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const std::complex<float>* x, int incx,
const std::complex<float>* y, int incy, std::complex<float>* result) {
return cublasCdotc(h, n, reinterpret_cast<const cuComplex*>(x), incx, reinterpret_cast<const cuComplex*>(y), incy,
reinterpret_cast<cuComplex*>(result));
}
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const std::complex<double>* x, int incx,
const std::complex<double>* y, int incy, std::complex<double>* result) {
return cublasZdotc(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx,
reinterpret_cast<const cuDoubleComplex*>(y), incy, reinterpret_cast<cuDoubleComplex*>(result));
}
// nrm2: result = ||x||_2 (always returns real)
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const float* x, int incx, float* result) {
return cublasSnrm2(h, n, x, incx, result);
}
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const double* x, int incx, double* result) {
return cublasDnrm2(h, n, x, incx, result);
}
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const std::complex<float>* x, int incx, float* result) {
return cublasScnrm2(h, n, reinterpret_cast<const cuComplex*>(x), incx, result);
}
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const std::complex<double>* x, int incx, double* result) {
return cublasDznrm2(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx, result);
}
// axpy: y += alpha * x
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const float* alpha, const float* x, int incx, float* y,
int incy) {
return cublasSaxpy(h, n, alpha, x, incx, y, incy);
}
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const double* alpha, const double* x, int incx, double* y,
int incy) {
return cublasDaxpy(h, n, alpha, x, incx, y, incy);
}
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const std::complex<float>* alpha,
const std::complex<float>* x, int incx, std::complex<float>* y, int incy) {
return cublasCaxpy(h, n, reinterpret_cast<const cuComplex*>(alpha), reinterpret_cast<const cuComplex*>(x), incx,
reinterpret_cast<cuComplex*>(y), incy);
}
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const std::complex<double>* alpha,
const std::complex<double>* x, int incx, std::complex<double>* y, int incy) {
return cublasZaxpy(h, n, reinterpret_cast<const cuDoubleComplex*>(alpha), reinterpret_cast<const cuDoubleComplex*>(x),
incx, reinterpret_cast<cuDoubleComplex*>(y), incy);
}
// scal: x *= alpha
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const float* alpha, float* x, int incx) {
return cublasSscal(h, n, alpha, x, incx);
}
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const double* alpha, double* x, int incx) {
return cublasDscal(h, n, alpha, x, incx);
}
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const std::complex<float>* alpha, std::complex<float>* x,
int incx) {
return cublasCscal(h, n, reinterpret_cast<const cuComplex*>(alpha), reinterpret_cast<cuComplex*>(x), incx);
}
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const std::complex<double>* alpha, std::complex<double>* x,
int incx) {
return cublasZscal(h, n, reinterpret_cast<const cuDoubleComplex*>(alpha), reinterpret_cast<cuDoubleComplex*>(x),
incx);
}
// copy: y = x
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const float* x, int incx, float* y, int incy) {
return cublasScopy(h, n, x, incx, y, incy);
}
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const double* x, int incx, double* y, int incy) {
return cublasDcopy(h, n, x, incx, y, incy);
}
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const std::complex<float>* x, int incx,
std::complex<float>* y, int incy) {
return cublasCcopy(h, n, reinterpret_cast<const cuComplex*>(x), incx, reinterpret_cast<cuComplex*>(y), incy);
}
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const std::complex<double>* x, int incx,
std::complex<double>* y, int incy) {
return cublasZcopy(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx, reinterpret_cast<cuDoubleComplex*>(y),
incy);
}
} // namespace internal
} // namespace Eigen

View File

@@ -0,0 +1,134 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// cuDSS support utilities: error checking macro, type mapping.
//
// cuDSS is NVIDIA's sparse direct solver library, supporting Cholesky (LL^T),
// LDL^T, and LU factorization on GPU. It requires CUDA 12.0+ and is
// distributed separately from the CUDA Toolkit.
#ifndef EIGEN_GPU_CUDSS_SUPPORT_H
#define EIGEN_GPU_CUDSS_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include <cudss.h>
namespace Eigen {
namespace internal {
// ---- Error checking ---------------------------------------------------------
#define EIGEN_CUDSS_CHECK(x) \
do { \
cudssStatus_t _s = (x); \
eigen_assert(_s == CUDSS_STATUS_SUCCESS && "cuDSS call failed: " #x); \
EIGEN_UNUSED_VARIABLE(_s); \
} while (0)
// ---- Scalar → cudssMatrixType_t for SPD/HPD ---------------------------------
template <typename Scalar>
struct cudss_spd_type;
template <>
struct cudss_spd_type<float> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
};
template <>
struct cudss_spd_type<double> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
};
template <>
struct cudss_spd_type<std::complex<float>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
};
template <>
struct cudss_spd_type<std::complex<double>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
};
// ---- Scalar → cudssMatrixType_t for symmetric/Hermitian ---------------------
template <typename Scalar>
struct cudss_symmetric_type;
template <>
struct cudss_symmetric_type<float> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
};
template <>
struct cudss_symmetric_type<double> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
};
template <>
struct cudss_symmetric_type<std::complex<float>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
};
template <>
struct cudss_symmetric_type<std::complex<double>> {
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
};
// ---- StorageIndex → cudaDataType_t ------------------------------------------
template <typename StorageIndex>
struct cudss_index_type;
template <>
struct cudss_index_type<int> {
static constexpr cudaDataType_t value = CUDA_R_32I;
};
template <>
struct cudss_index_type<int64_t> {
static constexpr cudaDataType_t value = CUDA_R_64I;
};
// ---- UpLo → cudssMatrixViewType_t -------------------------------------------
// For symmetric matrices stored as CSC (ColMajor), cuDSS sees CSR of A^T.
// Since A = A^T, the data is the same, but the triangle view must be swapped.
template <int UpLo, int StorageOrder>
struct cudss_view_type;
// ColMajor (CSC) passed as CSR: lower ↔ upper swap.
template <>
struct cudss_view_type<Lower, ColMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
};
template <>
struct cudss_view_type<Upper, ColMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
};
// RowMajor (CSR) passed directly: no swap needed.
template <>
struct cudss_view_type<Lower, RowMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
};
template <>
struct cudss_view_type<Upper, RowMajor> {
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
};
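// Example (restating the mapping above): a ColMajor SparseMatrix holding its Lower
// triangle is handed to cuDSS as the CSR of A^T, so cuDSS must read the Upper view:
//   static_assert(cudss_view_type<Lower, ColMajor>::value == CUDSS_MVIEW_UPPER, "");
//   static_assert(cudss_view_type<Upper, RowMajor>::value == CUDSS_MVIEW_UPPER, "");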
} // namespace internal
// ---- Ordering enum ----------------------------------------------------------
enum class GpuSparseOrdering {
AMD, // Default fill-reducing ordering
METIS, // METIS nested dissection
RCM // Reverse Cuthill-McKee
};
} // namespace Eigen
#endif // EIGEN_GPU_CUDSS_SUPPORT_H

View File

@@ -0,0 +1,103 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// cuFFT support utilities: error checking macro, type mapping.
#ifndef EIGEN_GPU_CUFFT_SUPPORT_H
#define EIGEN_GPU_CUFFT_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include <cufft.h>
namespace Eigen {
namespace internal {
// ---- Error checking ---------------------------------------------------------
#define EIGEN_CUFFT_CHECK(x) \
do { \
cufftResult _r = (x); \
eigen_assert(_r == CUFFT_SUCCESS && "cuFFT call failed: " #x); \
EIGEN_UNUSED_VARIABLE(_r); \
} while (0)
// ---- Scalar → cufftType traits ----------------------------------------------
template <typename Scalar>
struct cufft_c2c_type;
template <>
struct cufft_c2c_type<float> {
static constexpr cufftType value = CUFFT_C2C;
};
template <>
struct cufft_c2c_type<double> {
static constexpr cufftType value = CUFFT_Z2Z;
};
template <typename Scalar>
struct cufft_r2c_type;
template <>
struct cufft_r2c_type<float> {
static constexpr cufftType value = CUFFT_R2C;
};
template <>
struct cufft_r2c_type<double> {
static constexpr cufftType value = CUFFT_D2Z;
};
template <typename Scalar>
struct cufft_c2r_type;
template <>
struct cufft_c2r_type<float> {
static constexpr cufftType value = CUFFT_C2R;
};
template <>
struct cufft_c2r_type<double> {
static constexpr cufftType value = CUFFT_Z2D;
};
// ---- Type-dispatched cuFFT execution ----------------------------------------
// C2C
inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<float>* in, std::complex<float>* out,
int direction) {
return cufftExecC2C(plan, reinterpret_cast<cufftComplex*>(in), reinterpret_cast<cufftComplex*>(out), direction);
}
inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<double>* in, std::complex<double>* out,
int direction) {
return cufftExecZ2Z(plan, reinterpret_cast<cufftDoubleComplex*>(in), reinterpret_cast<cufftDoubleComplex*>(out),
direction);
}
// R2C
inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, float* in, std::complex<float>* out) {
return cufftExecR2C(plan, in, reinterpret_cast<cufftComplex*>(out));
}
inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, double* in, std::complex<double>* out) {
return cufftExecD2Z(plan, in, reinterpret_cast<cufftDoubleComplex*>(out));
}
// C2R
inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<float>* in, float* out) {
return cufftExecC2R(plan, reinterpret_cast<cufftComplex*>(in), out);
}
inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<double>* in, double* out) {
return cufftExecZ2D(plan, reinterpret_cast<cufftDoubleComplex*>(in), out);
}
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_CUFFT_SUPPORT_H

View File

@@ -0,0 +1,34 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// cuSPARSE support utilities: error checking macro.
#ifndef EIGEN_GPU_CUSPARSE_SUPPORT_H
#define EIGEN_GPU_CUSPARSE_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include <cusparse.h>
namespace Eigen {
namespace internal {
#define EIGEN_CUSPARSE_CHECK(x) \
do { \
cusparseStatus_t _s = (x); \
eigen_assert(_s == CUSPARSE_STATUS_SUCCESS && "cuSPARSE call failed: " #x); \
EIGEN_UNUSED_VARIABLE(_s); \
} while (0)
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_CUSPARSE_SUPPORT_H

View File

@@ -29,10 +29,11 @@ namespace Eigen {
namespace internal {
// ---- GEMM dispatch ----------------------------------------------------------
// GemmExpr<Lhs, Rhs> → cublasGemmEx(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
// GemmExpr<Lhs, Rhs> → cublasLtMatmul via GpuContext.
//
// The generic API cublasGemmEx handles all scalar types (float, double,
// complex<float>, complex<double>) via cudaDataType_t.
// Uses cublasLtMatmul for 64-bit dimension support and heuristic algorithm
// selection. All scalar types (float, double, complex<float>, complex<double>)
// are handled via cudaDataType_t.
template <typename Lhs, typename Rhs>
void dispatch_gemm(
@@ -46,6 +47,10 @@ void dispatch_gemm(
const DeviceMatrix<Scalar>& A = traits_lhs::matrix(expr.lhs());
const DeviceMatrix<Scalar>& B = traits_rhs::matrix(expr.rhs());
// cuBLAS GEMM: C must not alias A or B (undefined behavior).
eigen_assert(dst.data() != A.data() && "GEMM: output aliases left operand (use a temporary)");
eigen_assert(dst.data() != B.data() && "GEMM: output aliases right operand (use a temporary)");
constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op);
constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op);
@@ -89,13 +94,8 @@ void dispatch_gemm(
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
}
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
EIGEN_CUBLAS_CHECK(cublasGemmEx(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n),
static_cast<int>(k), &alpha_gval, A.data(), dtype, static_cast<int>(lda), B.data(),
dtype, static_cast<int>(ldb), &beta_gval, dst.data(), dtype, static_cast<int>(ldc),
compute, cuda_gemm_algo()));
cublaslt_gemm<Scalar>(ctx.cublasLtHandle(), ctx.cublasHandle(), transA, transB, m, n, k, &alpha_gval, A.data(), lda,
B.data(), ldb, &beta_gval, dst.data(), ldc, ctx.gemmWorkspace(), ctx.stream());
dst.recordReady(ctx.stream());
}
@@ -504,6 +504,284 @@ void DeviceSelfAdjointView<Scalar_, UpLo_>::rankUpdate(const DeviceMatrix<Scalar
internal::dispatch_syrk(GpuContext::threadLocal(), matrix(), expr, alpha, beta);
}
// ---- DeviceMatrix BLAS-1 out-of-line definitions ----------------------------
// Defined here because they need the full GpuContext definition.
// All methods take an explicit GpuContext& so callers can ensure same-stream
// execution (zero event overhead when all operations share one context).
//
// Reduction methods (dot, norm, squaredNorm) use CUBLAS_POINTER_MODE_DEVICE:
// the scalar result is written to device memory and returned as a
// DeviceScalar, so no host synchronization happens until the value is read.
// This keeps Eigen template compatibility: CG's
// `Scalar alpha = absNew / p.dot(tmp)` still works because the implicit
// conversion to Scalar synchronizes at the point of use, and reductions that
// are issued back-to-back and read together effectively share a single sync.
template <typename Scalar_>
DeviceScalar<typename DeviceMatrix<Scalar_>::Scalar> DeviceMatrix<Scalar_>::dot(GpuContext& ctx,
const DeviceMatrix& other) const {
const int n = static_cast<int>(rows_ * cols_);
eigen_assert(n == static_cast<int>(other.rows_ * other.cols_));
DeviceScalar<Scalar> result(Scalar(0), ctx.stream());
if (n > 0) {
waitReady(ctx.stream());
other.waitReady(ctx.stream());
cublasPointerMode_t prev;
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
EIGEN_CUBLAS_CHECK(internal::cublasXdot(ctx.cublasHandle(), n, data_, 1, other.data_, 1, result.devicePtr()));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
}
return result;
}
namespace internal {
// Real: dot(x,x) returns DeviceScalar<Scalar> which IS DeviceScalar<RealScalar>.
// Move-construct without any sync.
template <typename Scalar, typename RealScalar>
typename std::enable_if<std::is_same<Scalar, RealScalar>::value, DeviceScalar<RealScalar>>::type squaredNorm_from_dot(
DeviceScalar<Scalar>&& d, cudaStream_t) {
return std::move(d);
}
// Complex: must sync to extract the real part (DeviceScalar arithmetic is real-only).
template <typename Scalar, typename RealScalar>
typename std::enable_if<!std::is_same<Scalar, RealScalar>::value, DeviceScalar<RealScalar>>::type squaredNorm_from_dot(
DeviceScalar<Scalar>&& d, cudaStream_t stream) {
return DeviceScalar<RealScalar>(numext::real(Scalar(d)), stream);
}
} // namespace internal
template <typename Scalar_>
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::squaredNorm(GpuContext& ctx) const {
// Use dot(x,x) instead of nrm2()^2: dot kernel is ~4.5x faster than nrm2
// (nrm2 uses a numerically careful scaled-sum-of-squares algorithm that is
// unnecessary for CG convergence checks).
using RealScalar = typename NumTraits<Scalar_>::Real;
return internal::squaredNorm_from_dot<Scalar_, RealScalar>(dot(ctx, *this), ctx.stream());
}
template <typename Scalar_>
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::norm(GpuContext& ctx) const {
using RealScalar = typename NumTraits<Scalar>::Real;
const int n = static_cast<int>(rows_ * cols_);
DeviceScalar<RealScalar> result(RealScalar(0), ctx.stream());
if (n > 0) {
waitReady(ctx.stream());
cublasPointerMode_t prev;
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
EIGEN_CUBLAS_CHECK(internal::cublasXnrm2(ctx.cublasHandle(), n, data_, 1, result.devicePtr()));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
}
return result;
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::setZero(GpuContext& ctx) {
if (sizeInBytes() > 0) {
waitReady(ctx.stream());
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(data_, 0, sizeInBytes(), ctx.stream()));
recordReady(ctx.stream());
}
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::addScaled(GpuContext& ctx, Scalar alpha, const DeviceMatrix& x) {
const int n = static_cast<int>(rows_ * cols_);
eigen_assert(n == static_cast<int>(x.rows_ * x.cols_));
if (n > 0) {
waitReady(ctx.stream());
x.waitReady(ctx.stream());
EIGEN_CUBLAS_CHECK(internal::cublasXaxpy(ctx.cublasHandle(), n, &alpha, x.data_, 1, data_, 1));
recordReady(ctx.stream());
}
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::scale(GpuContext& ctx, Scalar alpha) {
const int n = static_cast<int>(rows_ * cols_);
if (n > 0) {
waitReady(ctx.stream());
EIGEN_CUBLAS_CHECK(internal::cublasXscal(ctx.cublasHandle(), n, &alpha, data_, 1));
recordReady(ctx.stream());
}
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::copyFrom(GpuContext& ctx, const DeviceMatrix& other) {
// Wait on *this before resize — resize may free the old buffer while another
// stream is still reading it.
if (!empty()) waitReady(ctx.stream());
resize(other.rows_, other.cols_);
const int n = static_cast<int>(rows_ * cols_);
if (n > 0) {
other.waitReady(ctx.stream());
EIGEN_CUBLAS_CHECK(internal::cublasXcopy(ctx.cublasHandle(), n, other.data_, 1, data_, 1));
recordReady(ctx.stream());
}
}
// ---- BLAS-1 operator overloads for CG compatibility -------------------------
// this += alpha * x (axpy)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceScaled<DeviceMatrix>& expr) {
addScaled(GpuContext::threadLocal(), expr.scalar(), internal::device_expr_traits<DeviceMatrix>::matrix(expr.inner()));
return *this;
}
// this -= alpha * x (axpy with negated alpha)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceScaled<DeviceMatrix>& expr) {
addScaled(GpuContext::threadLocal(), -expr.scalar(),
internal::device_expr_traits<DeviceMatrix>::matrix(expr.inner()));
return *this;
}
// this += x (axpy with alpha=1)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceMatrix& other) {
Scalar one(1);
addScaled(GpuContext::threadLocal(), one, other);
return *this;
}
// this -= x (axpy with alpha=-1)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceMatrix& other) {
Scalar neg_one(-1);
addScaled(GpuContext::threadLocal(), neg_one, other);
return *this;
}
// this *= alpha (scal, host pointer)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator*=(Scalar alpha) {
scale(GpuContext::threadLocal(), alpha);
return *this;
}
// this *= alpha (scal, device pointer — avoids host sync)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator*=(const DeviceScalar<Scalar>& alpha) {
const int n = static_cast<int>(rows_ * cols_);
if (n > 0) {
auto& ctx = GpuContext::threadLocal();
waitReady(ctx.stream());
cublasPointerMode_t prev;
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
EIGEN_CUBLAS_CHECK(internal::cublasXscal(ctx.cublasHandle(), n, alpha.devicePtr(), data_, 1));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
recordReady(ctx.stream());
}
return *this;
}
// this += DeviceScalar * x (axpy with CUBLAS_POINTER_MODE_DEVICE)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceScaledDevice<Scalar_>& expr) {
const int n = static_cast<int>(rows_ * cols_);
const auto& x = expr.matrix();
eigen_assert(n == static_cast<int>(x.rows_ * x.cols_));
if (n > 0) {
auto& ctx = GpuContext::threadLocal();
waitReady(ctx.stream());
x.waitReady(ctx.stream());
cublasPointerMode_t prev;
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
EIGEN_CUBLAS_CHECK(internal::cublasXaxpy(ctx.cublasHandle(), n, expr.alpha().devicePtr(), x.data_, 1, data_, 1));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
recordReady(ctx.stream());
}
return *this;
}
// this -= DeviceScalar * x (axpy with negated device scalar)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceScaledDevice<Scalar_>& expr) {
auto neg_alpha = -expr.alpha();
DeviceScaledDevice<Scalar_> neg_expr(neg_alpha, expr.matrix());
return operator+=(neg_expr);
}
// this = alpha * A + beta * B (cuBLAS geam)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const DeviceAddExpr<Scalar_>& expr) {
auto& ctx = GpuContext::threadLocal();
const auto& A = expr.A();
const auto& B = expr.B();
eigen_assert(A.rows() == B.rows() && A.cols() == B.cols());
const int m = static_cast<int>(A.rows());
const int n = static_cast<int>(A.cols());
// Wait on *this before resize — resize may free the old buffer while another
// stream is still reading it.
if (!empty()) waitReady(ctx.stream());
resize(A.rows(), A.cols());
if (m > 0 && n > 0) {
A.waitReady(ctx.stream());
B.waitReady(ctx.stream());
Scalar_ alpha = expr.alpha();
Scalar_ beta = expr.beta();
EIGEN_CUBLAS_CHECK(internal::cublasXgeam(ctx.cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, &alpha, A.data(), m,
&beta, B.data(), m, data_, m));
recordReady(ctx.stream());
}
return *this;
}
// cwiseProduct via NPP nppsMul (allocating).
template <typename Scalar_>
DeviceMatrix<Scalar_> DeviceMatrix<Scalar_>::cwiseProduct(GpuContext& ctx, const DeviceMatrix& other) const {
const int n = static_cast<int>(rows_ * cols_);
eigen_assert(n == static_cast<int>(other.rows_ * other.cols_));
DeviceMatrix result(rows_, cols_);
if (n > 0) {
waitReady(ctx.stream());
other.waitReady(ctx.stream());
internal::device_cwiseProduct(data_, other.data_, result.data_, n, ctx.stream());
result.recordReady(ctx.stream());
}
return result;
}
// In-place cwiseProduct: this = a .* b (reuses this buffer, no allocation).
template <typename Scalar_>
void DeviceMatrix<Scalar_>::cwiseProduct(GpuContext& ctx, const DeviceMatrix& a, const DeviceMatrix& b) {
const int n = static_cast<int>(a.rows_ * a.cols_);
eigen_assert(n == static_cast<int>(b.rows_ * b.cols_));
if (!empty()) waitReady(ctx.stream());
resize(a.rows_, a.cols_);
if (n > 0) {
a.waitReady(ctx.stream());
b.waitReady(ctx.stream());
internal::device_cwiseProduct(a.data_, b.data_, data_, n, ctx.stream());
recordReady(ctx.stream());
}
}
// Convenience overloads using thread-local default GpuContext.
template <typename Scalar_>
DeviceScalar<typename DeviceMatrix<Scalar_>::Scalar> DeviceMatrix<Scalar_>::dot(const DeviceMatrix& other) const {
return dot(GpuContext::threadLocal(), other);
}
template <typename Scalar_>
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::squaredNorm() const {
return squaredNorm(GpuContext::threadLocal());
}
template <typename Scalar_>
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::norm() const {
return norm(GpuContext::threadLocal());
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::setZero() {
setZero(GpuContext::threadLocal());
}
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_DISPATCH_H

View File

@@ -219,6 +219,87 @@ DeviceScaled<DeviceTransposeView<S>> operator*(S alpha, const DeviceTransposeVie
return {alpha, m};
}
// ---- DeviceScaledDevice: DeviceScalar * DeviceMatrix → device-pointer axpy ---
// Like DeviceScaled but carries a DeviceScalar (device pointer) instead of
// a host scalar. operator+= dispatches to cuBLAS axpy with POINTER_MODE_DEVICE.
template <typename Scalar_>
class DeviceScaledDevice {
public:
using Scalar = Scalar_;
DeviceScaledDevice(const DeviceScalar<Scalar>& alpha, const DeviceMatrix<Scalar>& mat) : alpha_(alpha), mat_(mat) {}
const DeviceScalar<Scalar>& alpha() const { return alpha_; }
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
private:
const DeviceScalar<Scalar>& alpha_;
const DeviceMatrix<Scalar>& mat_;
};
// DeviceScalar * DeviceMatrix → DeviceScaledDevice
template <typename S>
DeviceScaledDevice<S> operator*(const DeviceScalar<S>& alpha, const DeviceMatrix<S>& m) {
return {alpha, m};
}
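// Example (sketch, placeholder names): keeps the CG step size on the device end to end.
//   auto alpha = absNew / d_p.dot(d_tmp);  // DeviceScalar division, no host sync
//   d_x += alpha * d_p;                    // DeviceScaledDevice -> axpy, POINTER_MODE_DEVICE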
// ---- DeviceAddExpr: a + b → cublasXgeam -------------------------------------
// Captures `DeviceMatrix + DeviceScaled<DeviceMatrix>` (and reverse).
// Dispatched to geam: C = alpha * A + beta * B.
//
// Note: These operator+/- overloads are intentionally free functions on
// DeviceMatrix, not Eigen expression templates. DeviceMatrix does not inherit
// from MatrixBase, so there is no ambiguity with Eigen's own operator+/-.
// If DeviceMatrix is ever made an Eigen expression type, these would need to
// be revisited.
template <typename Scalar_>
class DeviceAddExpr {
public:
using Scalar = Scalar_;
DeviceAddExpr(Scalar alpha, const DeviceMatrix<Scalar>& A, Scalar beta, const DeviceMatrix<Scalar>& B)
: alpha_(alpha), A_(A), beta_(beta), B_(B) {}
Scalar alpha() const { return alpha_; }
Scalar beta() const { return beta_; }
const DeviceMatrix<Scalar>& A() const { return A_; }
const DeviceMatrix<Scalar>& B() const { return B_; }
private:
Scalar alpha_;
const DeviceMatrix<Scalar>& A_;
Scalar beta_;
const DeviceMatrix<Scalar>& B_;
};
// DeviceMatrix + DeviceMatrix → DeviceAddExpr (alpha=1, beta=1)
template <typename S>
DeviceAddExpr<S> operator+(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
return {S(1), a, S(1), b};
}
// DeviceMatrix + DeviceScaled<DeviceMatrix> → DeviceAddExpr (alpha=1, beta=scaled)
template <typename S>
DeviceAddExpr<S> operator+(const DeviceMatrix<S>& a, const DeviceScaled<DeviceMatrix<S>>& b) {
return {S(1), a, b.scalar(), b.inner()};
}
// DeviceScaled<DeviceMatrix> + DeviceMatrix → DeviceAddExpr (alpha=scaled, beta=1)
template <typename S>
DeviceAddExpr<S> operator+(const DeviceScaled<DeviceMatrix<S>>& a, const DeviceMatrix<S>& b) {
return {a.scalar(), a.inner(), S(1), b};
}
// DeviceMatrix - DeviceMatrix → DeviceAddExpr (alpha=1, beta=-1)
template <typename S>
DeviceAddExpr<S> operator-(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
return {S(1), a, S(-1), b};
}
// DeviceMatrix - DeviceScaled<DeviceMatrix> → DeviceAddExpr (alpha=1, beta=-scaled)
template <typename S>
DeviceAddExpr<S> operator-(const DeviceMatrix<S>& a, const DeviceScaled<DeviceMatrix<S>>& b) {
return {S(1), a, -b.scalar(), b.inner()};
}
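// Example (sketch): the CG direction update maps onto a single geam call, and geam
// tolerates the aliased output (d_p appears on both sides):
//   d_p = d_r + beta * d_p;   // DeviceAddExpr{1, d_r, beta, d_p} -> cublasXgeam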
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_EXPR_H

View File

@@ -53,6 +53,16 @@ template <typename>
class DeviceAssignment;
template <typename, typename>
class GemmExpr;
template <typename>
class DeviceScaled;
template <typename>
class SpMVExpr;
template <typename>
class DeviceAddExpr;
template <typename>
class DeviceScaledDevice;
template <typename>
class DeviceScalar;
template <typename, int>
class LltSolveExpr;
template <typename>
@@ -170,6 +180,8 @@ template <typename Scalar_>
class DeviceMatrix {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainObject = DeviceMatrix; // owning type (for CG template compatibility)
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
// ---- Construction / destruction ------------------------------------------
@@ -177,6 +189,16 @@ class DeviceMatrix {
/** Default: empty (0x0, no allocation). */
DeviceMatrix() = default;
/** Allocate uninitialized column vector of given size.
* Matches Matrix<Scalar,Dynamic,1>(n) for CG template compatibility. */
explicit DeviceMatrix(Index n) : rows_(n), cols_(1) {
eigen_assert(n >= 0);
size_t bytes = sizeInBytes();
if (bytes > 0) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
}
}
/** Allocate uninitialized device memory for a rows x cols matrix. */
DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols) {
eigen_assert(rows >= 0 && cols >= 0);
@@ -446,6 +468,104 @@ class DeviceMatrix {
template <int UpLo>
DeviceMatrix& operator=(const SymmExpr<Scalar, UpLo>& expr);
// ---- BLAS Level-1 operations ----------------------------------------------
// DeviceMatrix is always dense (lda == rows), so a vector is simply a
// DeviceMatrix with cols == 1. These BLAS-1 methods operate on the flat
// rows*cols element array, making them work for both vectors and matrices.
//
// All methods take an explicit GpuContext& for stream/handle control.
// When everything uses the same context, event waits are skipped (same-stream).
// Defined out-of-line in DeviceDispatch.h (needs GpuContext).
/** Dot product: this^H * other. Returns DeviceScalar — the result stays
* on device until read via implicit conversion to Scalar (which syncs).
* When used with `auto`, no sync occurs until the value is needed. */
DeviceScalar<Scalar> dot(GpuContext& ctx, const DeviceMatrix& other) const;
/** Squared L2 norm via dot(x, x). Returns DeviceScalar (no sync until read).
* For real types, the result stays on device. For complex types, falls back
* to host sync (DeviceScalar arithmetic is real-only). */
DeviceScalar<typename NumTraits<Scalar>::Real> squaredNorm(GpuContext& ctx) const;
/** L2 norm. Returns DeviceScalar (no host sync). */
DeviceScalar<typename NumTraits<Scalar>::Real> norm(GpuContext& ctx) const;
/** Set all elements to zero. */
void setZero(GpuContext& ctx);
/** this += alpha * x (cuBLAS axpy). Requires same total size. */
void addScaled(GpuContext& ctx, Scalar alpha, const DeviceMatrix& x);
/** this *= alpha (cuBLAS scal). */
void scale(GpuContext& ctx, Scalar alpha);
/** Deep copy: this = other (cuBLAS copy). Resizes if needed. */
void copyFrom(GpuContext& ctx, const DeviceMatrix& other);
// Convenience overloads using the thread-local default GpuContext.
DeviceScalar<Scalar> dot(const DeviceMatrix& other) const;
DeviceScalar<typename NumTraits<Scalar>::Real> squaredNorm() const;
DeviceScalar<typename NumTraits<Scalar>::Real> norm() const;
void setZero();
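// Example (sketch): with an explicit context every call runs on ctx's stream, so
// cross-stream event waits are skipped:
//   GpuContext ctx;
//   auto nrm2 = d_r.squaredNorm(ctx);   // DeviceScalar<RealScalar>, deferred sync
//   double v  = nrm2;                   // download + stream sync happen here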
// ---- BLAS-1 operator overloads for CG/iterative solver compatibility ------
// These allow CG code like `x += alpha * p` to work with DeviceMatrix.
// `alpha * DeviceMatrix` already returns `DeviceScaled<DeviceMatrix<Scalar>>`
// (defined in DeviceExpr.h). These operators dispatch to cuBLAS axpy/scal.
// Defined out-of-line in DeviceDispatch.h.
/** this += alpha * x (cuBLAS axpy). For `x += alpha * p`. */
DeviceMatrix& operator+=(const DeviceScaled<DeviceMatrix>& expr);
/** this -= alpha * x (cuBLAS axpy with negated alpha). For `r -= alpha * tmp`. */
DeviceMatrix& operator-=(const DeviceScaled<DeviceMatrix>& expr);
/** this += x (cuBLAS axpy with alpha=1). */
DeviceMatrix& operator+=(const DeviceMatrix& other);
/** this -= x (cuBLAS axpy with alpha=-1). */
DeviceMatrix& operator-=(const DeviceMatrix& other);
/** this *= alpha (cuBLAS scal, host pointer mode). For `p *= beta`. */
DeviceMatrix& operator*=(Scalar alpha);
/** this *= alpha (cuBLAS scal, device pointer mode). Avoids host sync. */
DeviceMatrix& operator*=(const DeviceScalar<Scalar>& alpha);
/** Element-wise product: result[i] = this[i] * other[i] (NPP nppsMul).
* Returns a new DeviceMatrix. Defined out-of-line in DeviceDispatch.h. */
DeviceMatrix cwiseProduct(GpuContext& ctx, const DeviceMatrix& other) const;
/** In-place element-wise product: this[i] = a[i] * b[i] (NPP nppsMul).
* Reuses this matrix's buffer when sizes match, avoiding cudaMalloc. */
void cwiseProduct(GpuContext& ctx, const DeviceMatrix& a, const DeviceMatrix& b);
/** this += DeviceScalar * x (cuBLAS axpy with POINTER_MODE_DEVICE). */
DeviceMatrix& operator+=(const DeviceScaledDevice<Scalar>& expr);
/** this -= DeviceScalar * x (cuBLAS axpy with negated device scalar). */
DeviceMatrix& operator-=(const DeviceScaledDevice<Scalar>& expr);
/** Assign from an SpMV expression: d_y = d_A * d_x. */
DeviceMatrix& operator=(const SpMVExpr<Scalar>& expr);
/** Assign from an add expression: d_C = alpha * d_A + beta * d_B (cuBLAS geam). */
DeviceMatrix& operator=(const DeviceAddExpr<Scalar>& expr);
/** No-op — all DeviceMatrix operations are implicitly noalias.
*
* Unlike Eigen's Matrix, where omitting .noalias() triggers a copy to a
* temporary for safety, DeviceMatrix dispatches directly to NVIDIA library
* calls which have no built-in aliasing protection. Every assignment
* (`d_C = d_A * d_B`, `d_y = d_A * d_x`, etc.) behaves as if .noalias()
* were specified. The caller must ensure operands don't alias the
* destination for GEMM and SpMV. geam (`d_C = d_A + alpha * d_B`) is
* safe with aliasing. Debug asserts catch violations.
*
* This method exists so that `tmp.noalias() = mat * p` compiles for both
* Matrix and DeviceMatrix. */
DeviceMatrix& noalias() { return *this; }
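// Example (sketch): aliasing rules in practice.
//   d_tmp.noalias() = d_A * d_p;   // fine: d_tmp is distinct from both operands
//   d_p = d_r + beta * d_p;        // fine: geam tolerates an aliased output
//   d_A = d_A * d_B;               // not allowed: GEMM output must not alias an operand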
private:
// ---- Private: adopt a raw device pointer (used by friend solvers) --------

View File

@@ -0,0 +1,121 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Device-resident scalar for deferred host synchronization.
//
// DeviceScalar<Scalar> wraps a single value in device memory. Reductions
// (dot, nrm2) write results directly to device memory via
// CUBLAS_POINTER_MODE_DEVICE, deferring host sync until the value is read.
//
// Implicit conversion to Scalar triggers cudaStreamSynchronize + download.
// In CG, this reduces 3 syncs/iter to effectively 1: the first conversion
// syncs the stream, subsequent conversions in the same expression just
// download (the stream is already flushed).
//
// Usage:
// auto dot_val = d_x.dot(d_y); // DeviceScalar, no sync
// auto norm_val = d_r.squaredNorm(); // DeviceScalar, no sync
// Scalar alpha = absNew / dot_val; // sync here (both values downloaded)
// d_x += alpha * d_p; // host-scalar axpy (as before)
#ifndef EIGEN_GPU_DEVICE_SCALAR_H
#define EIGEN_GPU_DEVICE_SCALAR_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include "./DeviceScalarOps.h"
namespace Eigen {
template <typename Scalar_>
class DeviceScalar {
public:
using Scalar = Scalar_;
/** Allocate uninitialized device scalar. Contents are undefined until written
* (e.g., by cuBLAS dot/nrm2 with POINTER_MODE_DEVICE). Consistent with
* DeviceMatrix(rows, cols) which also does not zero-initialize. */
explicit DeviceScalar(cudaStream_t stream = nullptr) : d_val_(sizeof(Scalar)), stream_(stream) {}
DeviceScalar(Scalar host_val, cudaStream_t stream) : d_val_(sizeof(Scalar)), stream_(stream) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_val_.ptr, &host_val, sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
}
DeviceScalar(DeviceScalar&& o) noexcept : d_val_(std::move(o.d_val_)), stream_(o.stream_) { o.stream_ = nullptr; }
DeviceScalar& operator=(DeviceScalar&& o) noexcept {
if (this != &o) {
d_val_ = std::move(o.d_val_);
stream_ = o.stream_;
o.stream_ = nullptr;
}
return *this;
}
DeviceScalar(const DeviceScalar&) = delete;
DeviceScalar& operator=(const DeviceScalar&) = delete;
/** Download from device. Synchronizes the stream on first call;
* subsequent calls in the same expression are cheap (stream already flushed). */
Scalar get() const {
Scalar result;
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&result, d_val_.ptr, sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
/** Implicit conversion — allows `Scalar alpha = deviceScalar` and
* `if (deviceScalar < threshold)`. Triggers sync. */
operator Scalar() const { return get(); }
Scalar* devicePtr() { return static_cast<Scalar*>(d_val_.ptr); }
const Scalar* devicePtr() const { return static_cast<const Scalar*>(d_val_.ptr); }
cudaStream_t stream() const { return stream_; }
// ---- Device-side arithmetic (no host sync) ---------------------------------
// Uses NPP from DeviceScalarOps.h. All results stay on device.
// Currently supports real types only (float, double). Complex types
// fall back to implicit conversion (host sync) for division.
//
// Note: DeviceScalar has no cross-stream readiness tracking. All
// operations must be on the same CUDA stream. This is the natural
// pattern in iterative solvers where one GpuContext owns all work.
friend DeviceScalar operator/(const DeviceScalar& a, const DeviceScalar& b) {
DeviceScalar result(a.stream_);
internal::device_scalar_div(a.devicePtr(), b.devicePtr(), result.devicePtr(), a.stream_);
return result;
}
friend DeviceScalar operator/(Scalar a, const DeviceScalar& b) {
DeviceScalar d_a(a, b.stream_);
return d_a / b;
}
friend DeviceScalar operator/(const DeviceScalar& a, Scalar b) {
DeviceScalar d_b(b, a.stream_);
return a / d_b;
}
DeviceScalar operator-() const {
DeviceScalar result(stream_);
internal::device_scalar_neg(devicePtr(), result.devicePtr(), stream_);
return result;
}
private:
internal::DeviceBuffer d_val_;
cudaStream_t stream_ = nullptr;
};
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_SCALAR_H

View File

@@ -0,0 +1,117 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Device-resident scalar and element-wise operations via NPP signals.
// Header-only — no custom CUDA kernels needed. Uses nppsDiv, nppsMul,
// nppsMulC from the NPP library (CUDA::npps, part of the CUDA toolkit).
#ifndef EIGEN_GPU_DEVICE_SCALAR_OPS_H
#define EIGEN_GPU_DEVICE_SCALAR_OPS_H
#include <cuda_runtime.h>
#include <npps_arithmetic_and_logical_operations.h>
namespace Eigen {
namespace internal {
// ---- NppStreamContext helper ------------------------------------------------
inline NppStreamContext make_npp_stream_ctx(cudaStream_t stream) {
// Cache device attributes (constant for process lifetime) in a thread-local.
// Only the stream and its flags vary per call.
struct CachedDeviceInfo {
bool initialized = false;
int device_id = 0;
int cc_major = 0;
int cc_minor = 0;
int mp_count = 0;
int max_threads_per_mp = 0;
int max_threads_per_block = 0;
int shared_mem_per_block = 0;
void init() {
if (initialized) return;
cudaGetDevice(&device_id);
cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device_id);
cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device_id);
cudaDeviceGetAttribute(&mp_count, cudaDevAttrMultiProcessorCount, device_id);
cudaDeviceGetAttribute(&max_threads_per_mp, cudaDevAttrMaxThreadsPerMultiProcessor, device_id);
cudaDeviceGetAttribute(&max_threads_per_block, cudaDevAttrMaxThreadsPerBlock, device_id);
cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id);
initialized = true;
}
};
thread_local CachedDeviceInfo cached;
cached.init();
NppStreamContext ctx = {};
ctx.hStream = stream;
ctx.nCudaDeviceId = cached.device_id;
ctx.nCudaDevAttrComputeCapabilityMajor = cached.cc_major;
ctx.nCudaDevAttrComputeCapabilityMinor = cached.cc_minor;
ctx.nMultiProcessorCount = cached.mp_count;
ctx.nMaxThreadsPerMultiProcessor = cached.max_threads_per_mp;
ctx.nMaxThreadsPerBlock = cached.max_threads_per_block;
ctx.nSharedMemPerBlock = cached.shared_mem_per_block;
cudaStreamGetFlags(stream, &ctx.nStreamFlags);
return ctx;
}
// ---- Scalar division: c = a / b (device-resident, async) --------------------
inline void device_scalar_div(const float* a, const float* b, float* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_32f_Ctx(b, a, c, 1, npp_ctx); // NPP: pDst[i] = pSrc2[i] / pSrc1[i]
}
inline void device_scalar_div(const double* a, const double* b, double* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_64f_Ctx(b, a, c, 1, npp_ctx); // NPP: pDst[i] = pSrc2[i] / pSrc1[i]
}
// ---- Scalar negation: c = -a (device-resident, async) -----------------------
inline void device_scalar_neg(const float* a, float* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMulC_32f_Ctx(a, -1.0f, c, 1, npp_ctx);
}
inline void device_scalar_neg(const double* a, double* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMulC_64f_Ctx(a, -1.0, c, 1, npp_ctx);
}
// ---- Element-wise vector multiply: c[i] = a[i] * b[i] ----------------------
inline void device_cwiseProduct(const float* a, const float* b, float* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMul_32f_Ctx(a, b, c, static_cast<size_t>(n), npp_ctx);
}
inline void device_cwiseProduct(const double* a, const double* b, double* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMul_64f_Ctx(a, b, c, static_cast<size_t>(n), npp_ctx);
}
// ---- Element-wise vector division: c[i] = a[i] / b[i] ----------------------
inline void device_cwiseQuotient(const float* a, const float* b, float* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_32f_Ctx(b, a, c, static_cast<size_t>(n), npp_ctx); // NPP: dst = src2 / src1
}
inline void device_cwiseQuotient(const double* a, const double* b, double* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_64f_Ctx(b, a, c, static_cast<size_t>(n), npp_ctx);
}
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_SCALAR_OPS_H

View File

@@ -28,6 +28,8 @@
#include "./CuBlasSupport.h"
#include "./CuSolverSupport.h"
#include <cusparse.h>
#include <cufft.h>
namespace Eigen {
@@ -44,38 +46,92 @@ namespace Eigen {
*/
class GpuContext {
public:
GpuContext() {
/** Create a new context with a dedicated CUDA stream. */
GpuContext() : owns_stream_(true) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
init_handles();
}
/** Create a context on an existing stream (e.g., stream 0 = nullptr).
* The caller retains ownership of the stream — this context will not destroy it. */
explicit GpuContext(cudaStream_t stream) : stream_(stream), owns_stream_(false) { init_handles(); }
~GpuContext() {
if (cusparse_) (void)cusparseDestroy(cusparse_);
if (cusolver_) (void)cusolverDnDestroy(cusolver_);
if (cublas_lt_) (void)cublasLtDestroy(cublas_lt_);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
if (owns_stream_ && stream_) (void)cudaStreamDestroy(stream_);
}
// Non-copyable, non-movable (owns library handles).
GpuContext(const GpuContext&) = delete;
GpuContext& operator=(const GpuContext&) = delete;
/** Lazily-created thread-local default context. */
/** Get the thread-local default context.
* If setThreadLocal() has been called, returns that context.
* Otherwise lazily creates a new context with a dedicated stream. */
static GpuContext& threadLocal() {
GpuContext* override = tl_override_ptr();
if (override) return *override;
thread_local GpuContext ctx;
return ctx;
}
/** Override the thread-local default context for this thread.
* The caller retains ownership of \p ctx — it must outlive all uses.
* Pass nullptr to restore the lazily-created default. */
static void setThreadLocal(GpuContext* ctx) { tl_override_ptr() = ctx; }
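// Example (sketch): run thread-local DeviceMatrix operators on a caller-owned stream.
//   GpuContext ctx(my_stream);             // borrows my_stream, will not destroy it
//   GpuContext::setThreadLocal(&ctx);      // d_x += alpha * d_p etc. now run on ctx
//   ...                                    // ctx must outlive these uses
//   GpuContext::setThreadLocal(nullptr);   // restore the lazily-created default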
cudaStream_t stream() const { return stream_; }
cublasHandle_t cublasHandle() const { return cublas_; }
cusolverDnHandle_t cusolverHandle() const { return cusolver_; }
/** cuBLASLt handle (lazy-initialized on first GEMM call). */
cublasLtHandle_t cublasLtHandle() const {
if (!cublas_lt_) {
EIGEN_CUBLAS_CHECK(cublasLtCreate(&cublas_lt_));
}
return cublas_lt_;
}
/** Workspace buffer for cublasLtMatmul (grown lazily by cublaslt_gemm).
* Not thread-safe — all GEMM calls must be on this context's stream. */
internal::DeviceBuffer* gemmWorkspace() const { return &gemm_workspace_; }
/** cuSPARSE handle (lazy-initialized on first call). */
cusparseHandle_t cusparseHandle() const {
if (!cusparse_) {
cusparseStatus_t s1 = cusparseCreate(&cusparse_);
eigen_assert(s1 == CUSPARSE_STATUS_SUCCESS && "cusparseCreate failed");
EIGEN_UNUSED_VARIABLE(s1);
cusparseStatus_t s2 = cusparseSetStream(cusparse_, stream_);
eigen_assert(s2 == CUSPARSE_STATUS_SUCCESS && "cusparseSetStream failed");
EIGEN_UNUSED_VARIABLE(s2);
}
return cusparse_;
}
private:
cudaStream_t stream_ = nullptr;
cublasHandle_t cublas_ = nullptr;
cusolverDnHandle_t cusolver_ = nullptr;
mutable cublasLtHandle_t cublas_lt_ = nullptr; // lazy
mutable cusparseHandle_t cusparse_ = nullptr; // lazy
mutable internal::DeviceBuffer gemm_workspace_; // lazy
bool owns_stream_ = true;
static GpuContext*& tl_override_ptr() {
thread_local GpuContext* ptr = nullptr;
return ptr;
}
void init_handles() {
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
}
};
} // namespace Eigen
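
A minimal sketch (not part of the patch) of the stream-borrowing constructor and setThreadLocal(): wrap the legacy default stream and make it this thread's default context, so subsequent DeviceMatrix work shares stream 0 with non-Eigen code.

// Borrow stream 0 (nullptr); this context does not destroy the stream.
Eigen::GpuContext default_stream_ctx(/*stream=*/nullptr);
Eigen::GpuContext::setThreadLocal(&default_stream_ctx);
// ... DeviceMatrix BLAS-1 / GEMM calls issued on this thread now run on stream 0 ...
Eigen::GpuContext::setThreadLocal(nullptr);  // restore the lazily-created per-thread context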

View File

@@ -116,6 +116,10 @@ class GpuSelfAdjointEigenSolver {
Index cols() const { return n_; }
Index rows() const { return n_; }
// TODO: Add device-side accessors (deviceEigenvalues(), deviceEigenvectors())
// returning DeviceMatrix views of the internal buffers, so users can chain
// GPU operations without round-tripping through host memory.
/** Eigenvalues in ascending order. Downloads from device. */
RealVector eigenvalues() const {
sync_info();


Eigen/src/GPU/GpuFFT.h Normal file
View File

@@ -0,0 +1,308 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU FFT via cuFFT.
//
// Standalone GPU FFT class with plan caching. Supports 1D and 2D transforms:
// C2C (complex-to-complex), R2C (real-to-complex), C2R (complex-to-real).
//
// Inverse transforms are scaled by 1/n (1D) or 1/(n*m) (2D) so that
// inv(fwd(x)) == x, matching Eigen's FFT convention.
//
// cuFFT plans are cached by (size, type) and reused across calls.
//
// Usage:
// GpuFFT<float> fft;
// VectorXcf X = fft.fwd(x); // 1D C2C or R2C
// VectorXcf y = fft.inv(X); // 1D C2C inverse
// VectorXf r = fft.invReal(X, n); // 1D C2R inverse
// MatrixXcf B = fft.fwd2d(A); // 2D C2C forward
// MatrixXcf C = fft.inv2d(B); // 2D C2C inverse
#ifndef EIGEN_GPU_FFT_H
#define EIGEN_GPU_FFT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuFftSupport.h"
#include "./CuBlasSupport.h"
#include <map>
namespace Eigen {
template <typename Scalar_>
class GpuFFT {
public:
using Scalar = Scalar_;
using Complex = std::complex<Scalar>;
using ComplexVector = Matrix<Complex, Dynamic, 1>;
using RealVector = Matrix<Scalar, Dynamic, 1>;
using ComplexMatrix = Matrix<Complex, Dynamic, Dynamic, ColMajor>;
GpuFFT() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
}
~GpuFFT() {
for (auto& kv : plans_) (void)cufftDestroy(kv.second);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuFFT(const GpuFFT&) = delete;
GpuFFT& operator=(const GpuFFT&) = delete;
// ---- 1D Complex-to-Complex ------------------------------------------------
/** Forward 1D C2C FFT. */
template <typename Derived>
ComplexVector fwd(const MatrixBase<Derived>& x,
typename std::enable_if<NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
const ComplexVector input(x.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);
ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));
ComplexVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
/** Inverse 1D C2C FFT. Scaled by 1/n. */
template <typename Derived>
ComplexVector inv(const MatrixBase<Derived>& X) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv() requires complex input");
const ComplexVector input(X.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);
ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));
// Scale by 1/n.
scale_device(static_cast<Complex*>(d_out_.ptr), n, Scalar(1) / Scalar(n));
ComplexVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- 1D Real-to-Complex ---------------------------------------------------
/** Forward 1D R2C FFT. Returns n/2+1 complex values (half-spectrum). */
template <typename Derived>
ComplexVector fwd(const MatrixBase<Derived>& x,
typename std::enable_if<!NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
const RealVector input(x.derived());
const int n = static_cast<int>(input.size());
if (n == 0) return ComplexVector(0);
const int n_complex = n / 2 + 1;
ensure_buffers(n * sizeof(Scalar), n_complex * sizeof(Complex));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_r2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(
internal::cufftExecR2C_dispatch(plan, static_cast<Scalar*>(d_in_.ptr), static_cast<Complex*>(d_out_.ptr)));
ComplexVector result(n_complex);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n_complex * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- 1D Complex-to-Real ---------------------------------------------------
/** Inverse 1D C2R FFT. Input is n/2+1 complex values, output is nfft real values.
* Scaled by 1/nfft. Caller must specify nfft (original real signal length). */
template <typename Derived>
RealVector invReal(const MatrixBase<Derived>& X, Index nfft) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "invReal() requires complex input");
const ComplexVector input(X.derived());
const int n = static_cast<int>(nfft);
const int n_complex = n / 2 + 1;
eigen_assert(input.size() == n_complex);
if (n == 0) return RealVector(0);
ensure_buffers(n_complex * sizeof(Complex), n * sizeof(Scalar));
// cuFFT C2R may overwrite the input, so we copy to d_in_.
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_in_.ptr, input.data(), n_complex * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_1d(n, internal::cufft_c2r_type<Scalar>::value);
EIGEN_CUFFT_CHECK(
internal::cufftExecC2R_dispatch(plan, static_cast<Complex*>(d_in_.ptr), static_cast<Scalar*>(d_out_.ptr)));
// Scale by 1/n.
scale_device_real(static_cast<Scalar*>(d_out_.ptr), n, Scalar(1) / Scalar(n));
RealVector result(n);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- 2D Complex-to-Complex ------------------------------------------------
/** Forward 2D C2C FFT. Input and output are rows x cols complex matrices. */
template <typename Derived>
ComplexMatrix fwd2d(const MatrixBase<Derived>& A) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "fwd2d() requires complex input");
const ComplexMatrix input(A.derived());
const int rows = static_cast<int>(input.rows());
const int cols = static_cast<int>(input.cols());
if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
ensure_buffers(total, total);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));
ComplexMatrix result(rows, cols);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
/** Inverse 2D C2C FFT. Scaled by 1/(rows*cols). */
template <typename Derived>
ComplexMatrix inv2d(const MatrixBase<Derived>& A) {
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv2d() requires complex input");
const ComplexMatrix input(A.derived());
const int rows = static_cast<int>(input.rows());
const int cols = static_cast<int>(input.cols());
if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
ensure_buffers(total, total);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));
cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));
// Scale by 1/(rows*cols).
const int total_elems = rows * cols;
scale_device(static_cast<Complex*>(d_out_.ptr), total_elems, Scalar(1) / Scalar(total_elems));
ComplexMatrix result(rows, cols);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
// ---- Accessors ------------------------------------------------------------
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cublasHandle_t cublas_ = nullptr;
std::map<int64_t, cufftHandle> plans_;
internal::DeviceBuffer d_in_;
internal::DeviceBuffer d_out_;
size_t d_in_size_ = 0;
size_t d_out_size_ = 0;
void ensure_buffers(size_t in_bytes, size_t out_bytes) {
if (in_bytes > d_in_size_) {
if (d_in_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_in_ = internal::DeviceBuffer(in_bytes);
d_in_size_ = in_bytes;
}
if (out_bytes > d_out_size_) {
if (d_out_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_out_ = internal::DeviceBuffer(out_bytes);
d_out_size_ = out_bytes;
}
}
// Plan key encoding: key = (dims << 5) | (type << 1) | rank, with rank = 0 for 1D and 1 for 2D.
// The cufftType values used by a single Scalar differ by at most 3, so distinct (dims, type) pairs map to distinct keys.
static int64_t plan_key_1d(int n, cufftType type) { return (int64_t(n) << 5) | (int64_t(type) << 1) | 0; }
static int64_t plan_key_2d(int rows, int cols, cufftType type) {
return (int64_t(rows) << 35) | (int64_t(cols) << 5) | (int64_t(type) << 1) | 1;
}
cufftHandle get_plan_1d(int n, cufftType type) {
int64_t key = plan_key_1d(n, type);
auto it = plans_.find(key);
if (it != plans_.end()) return it->second;
cufftHandle plan;
EIGEN_CUFFT_CHECK(cufftPlan1d(&plan, n, type, /*batch=*/1));
EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
plans_[key] = plan;
return plan;
}
cufftHandle get_plan_2d(int rows, int cols, cufftType type) {
int64_t key = plan_key_2d(rows, cols, type);
auto it = plans_.find(key);
if (it != plans_.end()) return it->second;
// cuFFT uses row-major (C order) for 2D: first dim = rows, second = cols.
// Eigen matrices are column-major, so we pass (cols, rows) to cuFFT
// to get the correct 2D transform.
cufftHandle plan;
EIGEN_CUFFT_CHECK(cufftPlan2d(&plan, cols, rows, type));
EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
plans_[key] = plan;
return plan;
}
// Scale complex array on device using cuBLAS scal.
void scale_device(Complex* d_ptr, int n, Scalar alpha) { scale_complex(cublas_, d_ptr, n, alpha); }
// Scale real array on device using cuBLAS scal.
void scale_device_real(Scalar* d_ptr, int n, Scalar alpha) { scale_real(cublas_, d_ptr, n, alpha); }
// Type-dispatched cuBLAS scal wrappers (C++14 compatible).
static void scale_complex(cublasHandle_t h, std::complex<float>* p, int n, float a) {
EIGEN_CUBLAS_CHECK(cublasCsscal(h, n, &a, reinterpret_cast<cuComplex*>(p), 1));
}
static void scale_complex(cublasHandle_t h, std::complex<double>* p, int n, double a) {
EIGEN_CUBLAS_CHECK(cublasZdscal(h, n, &a, reinterpret_cast<cuDoubleComplex*>(p), 1));
}
static void scale_real(cublasHandle_t h, float* p, int n, float a) {
EIGEN_CUBLAS_CHECK(cublasSscal(h, n, &a, p, 1));
}
static void scale_real(cublasHandle_t h, double* p, int n, double a) {
EIGEN_CUBLAS_CHECK(cublasDscal(h, n, &a, p, 1));
}
};
} // namespace Eigen
#endif // EIGEN_GPU_FFT_H
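
A minimal usage sketch (not part of the patch), mirroring the header comment above; the sizes are arbitrary and `<Eigen/GPU>` is assumed to be included.

Eigen::GpuFFT<float> fft;
Eigen::VectorXf x = Eigen::VectorXf::Random(1024);
Eigen::VectorXcf X = fft.fwd(x);             // R2C: 1024/2 + 1 = 513 coefficients
Eigen::VectorXf xr = fft.invReal(X, 1024);   // C2R, scaled by 1/1024, so xr ≈ x
Eigen::MatrixXcf A = Eigen::MatrixXcf::Random(64, 32);
Eigen::MatrixXcf B = fft.fwd2d(A);
Eigen::MatrixXcf C = fft.inv2d(B);           // C ≈ A; reuses the cached 64x32 C2C plan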

View File

@@ -179,7 +179,10 @@ class GpuQR {
/** Solve A * X = B via QR: X = R^{-1} * Q^H * B (least-squares for m >= n).
* Uses ormqr (apply Q^H) + trsm (solve R), without forming Q explicitly.
* Requires m >= n (overdetermined or square). Underdetermined not supported. */
* Requires m >= n (overdetermined or square). Underdetermined not supported.
*
* TODO: Add device-side accessor for the R factor (and Q application) as
* DeviceMatrix, so users can chain GPU operations without host round-trips. */
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
sync_info();

View File

@@ -18,6 +18,7 @@
// GpuSVD<double> svd(A, ComputeThinU | ComputeThinV);
// VectorXd S = svd.singularValues();
// MatrixXd U = svd.matrixU(); // m×k or m×m
// MatrixXd V = svd.matrixV(); // n×k or n×n (matches JacobiSVD)
// MatrixXd VT = svd.matrixVT(); // k×n or n×n (this is V^T)
// MatrixXd X = svd.solve(B); // pseudoinverse
// MatrixXd X = svd.solve(B, k); // truncated (top k triplets)
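
A small sketch (not part of the patch) of the new matrixV() accessor; by definition it returns matrixVT().adjoint(), so the two views of the right singular vectors are interchangeable.

Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 40);
Eigen::GpuSVD<double> svd(A, Eigen::ComputeThinU | Eigen::ComputeThinV);
Eigen::MatrixXd V  = svd.matrixV();    // 40 x 40, same convention as JacobiSVD
Eigen::MatrixXd VT = svd.matrixVT();   // 40 x 40, V^T as stored by cuSOLVER
// (V - VT.adjoint()).norm() == 0 by construction;
// A ≈ svd.matrixU() * svd.singularValues().asDiagonal() * VT up to rounding.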
@@ -53,6 +54,7 @@ class GpuSVD {
~GpuSVD() {
if (handle_) (void)cusolverDnDestroy(handle_);
if (cublas_lt_) (void)cublasLtDestroy(cublas_lt_);
if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
@@ -143,6 +145,10 @@ class GpuSVD {
Index rows() const { return transposed_ ? n_ : m_; }
Index cols() const { return transposed_ ? m_ : n_; }
// TODO: Add device-side accessors (deviceU(), deviceVT(), deviceSingularValues())
// returning DeviceMatrix views of the internal buffers, so users can chain
// GPU operations without round-tripping through host memory.
/** Singular values (always available). Downloads from device on each call. */
RealVector singularValues() const {
sync_info();
@@ -181,6 +187,10 @@ class GpuSVD {
}
}
/** Right singular vectors V. Returns n_orig × k or n_orig × n_orig.
* Equivalent to matrixVT().adjoint(). Matches Eigen's JacobiSVD::matrixV() API. */
PlainMatrix matrixV() const { return matrixVT().adjoint(); }
/** Right singular vectors transposed V^T. Returns k × n_orig or n_orig × n_orig.
* For transposed case, VT comes from cuSOLVER's U. */
PlainMatrix matrixVT() const {
@@ -250,6 +260,8 @@ class GpuSVD {
cudaStream_t stream_ = nullptr;
cusolverDnHandle_t handle_ = nullptr;
cublasHandle_t cublas_ = nullptr;
cublasLtHandle_t cublas_lt_ = nullptr;
mutable internal::DeviceBuffer gemm_workspace_;
internal::CusolverParams params_;
internal::DeviceBuffer d_A_; // working copy of A (overwritten by gesvd)
internal::DeviceBuffer d_U_; // left singular vectors
@@ -273,6 +285,7 @@ class GpuSVD {
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
EIGEN_CUBLASLT_CHECK(cublasLtCreate(&cublas_lt_));
ensure_scratch(0);
}
@@ -406,23 +419,21 @@ class GpuSVD {
internal::DeviceBuffer d_tmp(static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar));
{
Scalar alpha_one(1), beta_zero(0);
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;
if (!transposed_) {
// U_stored^H * B: (m_×kk)^H × (m_×nrhs) → kk×nrhs.
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs),
static_cast<int>(m_), &alpha_one, d_U_.ptr, dtype, static_cast<int>(m_),
d_B.ptr, dtype, static_cast<int>(m_orig), &beta_zero, d_tmp.ptr, dtype,
static_cast<int>(kk), compute, internal::cuda_gemm_algo()));
internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_C, CUBLAS_OP_N, kk, nrhs, m_, &alpha_one,
static_cast<const Scalar*>(d_U_.ptr), m_, static_cast<const Scalar*>(d_B.ptr),
m_orig, &beta_zero, static_cast<Scalar*>(d_tmp.ptr), kk, &gemm_workspace_,
stream_);
} else {
// VT_stored * B: VT_stored is vtrows×n_ = kk×m_orig (thin), NoTrans.
// vtrows×m_orig times m_orig×nrhs → vtrows×nrhs. Use first kk rows.
const Index vtrows_stored = (swap_uv_options(options_) & ComputeFullV) ? n_ : k;
EIGEN_CUBLAS_CHECK(cublasGemmEx(
cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs), static_cast<int>(m_orig),
&alpha_one, d_VT_.ptr, dtype, static_cast<int>(vtrows_stored), d_B.ptr, dtype, static_cast<int>(m_orig),
&beta_zero, d_tmp.ptr, dtype, static_cast<int>(kk), compute, internal::cuda_gemm_algo()));
internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_N, CUBLAS_OP_N, kk, nrhs, m_orig, &alpha_one,
static_cast<const Scalar*>(d_VT_.ptr), vtrows_stored,
static_cast<const Scalar*>(d_B.ptr), m_orig, &beta_zero,
static_cast<Scalar*>(d_tmp.ptr), kk, &gemm_workspace_, stream_);
}
}
@@ -454,21 +465,19 @@ class GpuSVD {
{
internal::DeviceBuffer d_X(static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
Scalar alpha_one(1), beta_zero(0);
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;
if (!transposed_) {
const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(n_orig),
static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_VT_.ptr, dtype,
static_cast<int>(vtrows), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero,
d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo()));
internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_C, CUBLAS_OP_N, n_orig, nrhs, kk, &alpha_one,
static_cast<const Scalar*>(d_VT_.ptr), vtrows,
static_cast<const Scalar*>(d_tmp.ptr), kk, &beta_zero,
static_cast<Scalar*>(d_X.ptr), n_orig, &gemm_workspace_, stream_);
} else {
// U_stored is m_×ucols. V_orig = U_stored[:,:kk]. NoTrans × tmp.
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(n_orig),
static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_U_.ptr, dtype,
static_cast<int>(m_), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero,
d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo()));
internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_N, CUBLAS_OP_N, n_orig, nrhs, kk, &alpha_one,
static_cast<const Scalar*>(d_U_.ptr), m_, static_cast<const Scalar*>(d_tmp.ptr),
kk, &beta_zero, static_cast<Scalar*>(d_X.ptr), n_orig, &gemm_workspace_,
stream_);
}
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_X.ptr,

View File

@@ -0,0 +1,481 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix
// multiply (SpMM) via cuSPARSE.
//
// GpuSparseContext manages cuSPARSE descriptors and device buffers. It accepts
// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the GPU.
// RowMajor input is implicitly converted to ColMajor.
//
// Can borrow a GpuContext for same-stream execution with BLAS-1 ops (zero
// event overhead in iterative solvers like CG).
//
// Usage:
// // Standalone (own stream):
// GpuSparseContext<double> ctx;
// VectorXd y = ctx.multiply(A, x);
//
// // Shared context (same stream as BLAS-1 ops):
// GpuContext gpu_ctx;
// GpuSparseContext<double> sparse_ctx(gpu_ctx);
// VectorXd y = sparse_ctx.multiply(A, x);
//
// // Device-resident (no host roundtrip):
// sparse_ctx.multiply(A, d_x, d_y); // DeviceMatrix in/out
#ifndef EIGEN_GPU_SPARSE_CONTEXT_H
#define EIGEN_GPU_SPARSE_CONTEXT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSparseSupport.h"
namespace Eigen {
// Forward declarations.
template <typename Scalar_>
class GpuSparseContext;
template <typename Scalar_>
class DeviceSparseView;
/** SpMV expression: DeviceSparseView * DeviceMatrix → SpMVExpr.
* Evaluated by DeviceMatrix::operator=(SpMVExpr). */
template <typename Scalar_>
class SpMVExpr {
public:
using Scalar = Scalar_;
SpMVExpr(const DeviceSparseView<Scalar>& view, const DeviceMatrix<Scalar>& x) : view_(view), x_(x) {}
const DeviceSparseView<Scalar>& view() const { return view_; }
const DeviceMatrix<Scalar>& x() const { return x_; }
private:
const DeviceSparseView<Scalar>& view_;
const DeviceMatrix<Scalar>& x_;
};
/** Device-resident sparse matrix view. Returned by GpuSparseContext::deviceView().
* Lightweight handle referencing the context's cached device data.
*
* \warning One GpuSparseContext caches one sparse matrix at a time.
* Creating a second deviceView on the same context overwrites the first.
* For multiple simultaneous sparse matrices, use separate GpuSparseContext
* instances (they can share a GpuContext for same-stream execution).
*
* Supports `d_y = d_A * d_x` via SpMVExpr. */
template <typename Scalar_>
class DeviceSparseView {
public:
using Scalar = Scalar_;
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
DeviceSparseView(GpuSparseContext<Scalar>& ctx, const SpMat& A) : ctx_(ctx), A_(A) {}
/** SpMV expression: d_A * d_x. Evaluated by DeviceMatrix::operator=. */
SpMVExpr<Scalar> operator*(const DeviceMatrix<Scalar>& x) const { return SpMVExpr<Scalar>(*this, x); }
Index rows() const { return A_.rows(); }
Index cols() const { return A_.cols(); }
const GpuSparseContext<Scalar>& context() const { return ctx_; }
const SpMat& matrix() const { return A_; }
private:
GpuSparseContext<Scalar>& ctx_;
const SpMat& A_;
};
template <typename Scalar_>
class GpuSparseContext {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using StorageIndex = int;
using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
using DenseVector = Matrix<Scalar, Dynamic, 1>;
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
/** Standalone: creates own stream and cuSPARSE handle. */
GpuSparseContext() : owns_handle_(true) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
owns_stream_ = true;
EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_));
EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_));
}
/** Borrow a GpuContext: shares stream and cuSPARSE handle.
* The GpuContext must outlive this GpuSparseContext. */
explicit GpuSparseContext(GpuContext& ctx)
: stream_(ctx.stream()), handle_(ctx.cusparseHandle()), owns_stream_(false), owns_handle_(false) {}
~GpuSparseContext() {
destroy_descriptors();
if (owns_handle_ && handle_) (void)cusparseDestroy(handle_);
if (owns_stream_ && stream_) (void)cudaStreamDestroy(stream_);
}
GpuSparseContext(const GpuSparseContext&) = delete;
GpuSparseContext& operator=(const GpuSparseContext&) = delete;
// ---- Device sparse view (for expression syntax: d_y = d_A * d_x) ----------
/** Upload a sparse matrix to device and return a lightweight view.
* The sparse data is uploaded immediately and cached in this context.
* The returned view can be used for repeated SpMV without re-uploading.
* If the matrix values change, call deviceView() again to re-upload.
*
* \warning One context caches one matrix. Calling deviceView() again
* overwrites the previous upload. For multiple simultaneous matrices,
* use separate GpuSparseContext instances sharing the same GpuContext.
*
* Supports `d_y = d_A * d_x` expression syntax. */
DeviceSparseView<Scalar> deviceView(const SpMat& A) {
eigen_assert(A.isCompressed());
upload_sparse(A);
return DeviceSparseView<Scalar>(*this, A);
}
// ---- SpMV: y = A * x (host vectors) --------------------------------------
/** Compute y = A * x. Returns y as a new dense vector. */
template <typename InputType, typename Rhs>
DenseVector multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const SpMat mat(A.derived());
DenseVector y(mat.rows());
y.setZero();
multiply_host_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
return y;
}
/** Compute y = alpha * op(A) * x + beta * y (in-place, host vectors). */
template <typename InputType, typename Rhs, typename Dest>
void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y,
Scalar alpha = Scalar(1), Scalar beta = Scalar(0),
cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
const SpMat mat(A.derived());
multiply_host_impl(mat, x.derived(), y.derived(), alpha, beta, op);
}
// ---- SpMV: y = A * x (DeviceMatrix, no host roundtrip) -------------------
/** Compute d_y = A * d_x. Device-resident, no host transfer.
* Sparse matrix A is uploaded to device (cached). Dense vectors stay on device. */
template <typename InputType>
void multiply(const SparseMatrixBase<InputType>& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y) {
const SpMat mat(A.derived());
multiply_device_impl(mat, d_x, d_y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
}
/** Compute d_y = alpha * op(A) * d_x + beta * d_y (DeviceMatrix, in-place). */
template <typename InputType>
void multiply(const SparseMatrixBase<InputType>& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y,
Scalar alpha, Scalar beta, cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
const SpMat mat(A.derived());
multiply_device_impl(mat, d_x, d_y, alpha, beta, op);
}
// ---- SpMV transpose -------------------------------------------------------
/** Compute y = A^T * x (host vectors). */
template <typename InputType, typename Rhs>
DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const SpMat mat(A.derived());
DenseVector y(mat.cols());
y.setZero();
multiply_host_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE);
return y;
}
// ---- SpMM: Y = A * X (host, multiple RHS) --------------------------------
/** Compute Y = A * X where X is a dense matrix. Returns Y. */
template <typename InputType, typename Rhs>
DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X) {
const SpMat mat(A.derived());
const DenseMatrix rhs(X.derived());
eigen_assert(mat.cols() == rhs.rows());
const Index m = mat.rows();
const Index n = rhs.cols();
if (m == 0 || n == 0 || mat.nonZeros() == 0) return DenseMatrix::Zero(m, n);
DenseMatrix Y = DenseMatrix::Zero(m, n);
spmm_impl(mat, rhs, Y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
return Y;
}
// ---- Accessors ------------------------------------------------------------
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusparseHandle_t handle_ = nullptr;
bool owns_stream_ = false;
bool owns_handle_ = false;
// Cached device buffers for sparse matrix (grow-only).
internal::DeviceBuffer d_outerPtr_;
internal::DeviceBuffer d_innerIdx_;
internal::DeviceBuffer d_values_;
size_t d_outerPtr_size_ = 0;
size_t d_innerIdx_size_ = 0;
size_t d_values_size_ = 0;
// Cached device buffers for host-API dense vectors (grow-only).
internal::DeviceBuffer d_x_;
internal::DeviceBuffer d_y_;
size_t d_x_size_ = 0;
size_t d_y_size_ = 0;
mutable internal::DeviceBuffer d_workspace_;
mutable size_t d_workspace_size_ = 0;
// Cached cuSPARSE sparse matrix descriptor.
cusparseSpMatDescr_t spmat_desc_ = nullptr;
Index cached_rows_ = -1;
Index cached_cols_ = -1;
Index cached_nnz_ = -1;
// ---- SpMV with host vectors (upload/download per call) --------------------
template <typename RhsDerived, typename DestDerived>
void multiply_host_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta,
cusparseOperation_t op) {
eigen_assert(A.isCompressed());
const Index m = A.rows();
const Index n = A.cols();
const Index nnz = A.nonZeros();
const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
eigen_assert(x.size() == x_size);
eigen_assert(y.size() == y_size);
if (m == 0 || n == 0 || nnz == 0) {
if (beta == Scalar(0))
y.setZero();
else
y *= beta;
return;
}
upload_sparse(A);
ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar));
const DenseVector x_tmp(x);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_.ptr, x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar));
if (beta != Scalar(0)) {
const DenseVector y_tmp(y);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_y_.ptr, y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
}
exec_spmv(x_size, y_size, d_x_.ptr, d_y_.ptr, alpha, beta, op);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(y.data(), d_y_.ptr, y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
}
// ---- SpMV with DeviceMatrix (no host transfer) ----------------------------
// Called by public multiply(A, d_x, d_y) — always re-uploads A.
void multiply_device_impl(const SpMat& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y, Scalar alpha,
Scalar beta, cusparseOperation_t op) {
upload_sparse(A);
spmv_device_exec(d_x, d_y, alpha, beta, op);
}
public:
/** Execute SpMV using the already-uploaded sparse matrix (no re-upload).
* Used by SpMVExpr (d_y = d_A * d_x) for cached deviceView() paths.
* The sparse matrix must have been uploaded via deviceView() or multiply(). */
void spmv_device_exec(const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y, Scalar alpha = Scalar(1),
Scalar beta = Scalar(0), cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) const {
eigen_assert(spmat_desc_ && "sparse matrix not uploaded — call deviceView() or multiply() first");
// cuSPARSE SpMV: y must not alias x (undefined behavior).
eigen_assert(d_x.data() != d_y.data() && "SpMV: output aliases input vector");
const Index m = cached_rows_;
const Index n = cached_cols_;
const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
eigen_assert(d_x.rows() * d_x.cols() == x_size);
if (m == 0 || n == 0 || cached_nnz_ == 0) {
d_y.resize(y_size, 1);
if (beta == Scalar(0)) {
d_y.setZero();
}
return;
}
// Ensure d_y is allocated.
if (d_y.rows() * d_y.cols() != y_size) {
d_y.resize(y_size, 1);
}
// Wait for input data to be ready on this stream.
d_x.waitReady(stream_);
d_y.waitReady(stream_);
exec_spmv(x_size, y_size, const_cast<void*>(static_cast<const void*>(d_x.data())), static_cast<void*>(d_y.data()),
alpha, beta, op);
d_y.recordReady(stream_);
}
private:
// ---- Shared SpMV execution ------------------------------------------------
void exec_spmv(Index x_size, Index y_size, void* d_x_ptr, void* d_y_ptr, Scalar alpha, Scalar beta,
cusparseOperation_t op) const {
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr;
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_ptr, dtype));
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_ptr, dtype));
size_t ws_size = 0;
EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
CUSPARSE_SPMV_ALG_DEFAULT, &ws_size));
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.ptr));
(void)cusparseDestroyDnVec(x_desc);
(void)cusparseDestroyDnVec(y_desc);
}
// ---- SpMM implementation --------------------------------------------------
void spmm_impl(const SpMat& A, const DenseMatrix& X, DenseMatrix& Y, Scalar alpha, Scalar beta,
cusparseOperation_t op) {
eigen_assert(A.isCompressed());
const Index m = A.rows();
const Index n = X.cols();
const Index k = A.cols();
const Index nnz = A.nonZeros();
if (m == 0 || n == 0 || k == 0 || nnz == 0) {
if (beta == Scalar(0))
Y.setZero();
else
Y *= beta;
return;
}
upload_sparse(A);
const size_t x_bytes = static_cast<size_t>(k) * static_cast<size_t>(n) * sizeof(Scalar);
const size_t y_bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(Scalar);
ensure_buffer(d_x_, d_x_size_, x_bytes);
ensure_buffer(d_y_, d_y_size_, y_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_x_.ptr, X.data(), x_bytes, cudaMemcpyHostToDevice, stream_));
if (beta != Scalar(0)) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.ptr, Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_));
}
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr;
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k, n, k, d_x_.ptr, dtype, CUSPARSE_ORDER_COL));
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m, n, m, d_y_.ptr, dtype, CUSPARSE_ORDER_COL));
size_t ws_size = 0;
EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_,
x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size));
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta,
y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.ptr));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.ptr, y_bytes, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
(void)cusparseDestroyDnMat(x_desc);
(void)cusparseDestroyDnMat(y_desc);
}
// ---- Helpers --------------------------------------------------------------
void upload_sparse(const SpMat& A) {
const Index m = A.rows();
const Index n = A.cols();
const Index nnz = A.nonZeros();
const size_t outer_bytes = static_cast<size_t>(n + 1) * sizeof(StorageIndex);
const size_t inner_bytes = static_cast<size_t>(nnz) * sizeof(StorageIndex);
const size_t val_bytes = static_cast<size_t>(nnz) * sizeof(Scalar);
ensure_buffer(d_outerPtr_, d_outerPtr_size_, outer_bytes);
ensure_buffer(d_innerIdx_, d_innerIdx_size_, inner_bytes);
ensure_buffer(d_values_, d_values_size_, val_bytes);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_outerPtr_.ptr, A.outerIndexPtr(), outer_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_innerIdx_.ptr, A.innerIndexPtr(), inner_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, A.valuePtr(), val_bytes, cudaMemcpyHostToDevice, stream_));
if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) {
destroy_descriptors();
constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value;
EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr,
idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO, val_type));
cached_rows_ = m;
cached_cols_ = n;
cached_nnz_ = nnz;
} else {
EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr));
}
}
void destroy_descriptors() {
if (spmat_desc_) {
(void)cusparseDestroySpMat(spmat_desc_);
spmat_desc_ = nullptr;
}
cached_rows_ = -1;
cached_cols_ = -1;
cached_nnz_ = -1;
}
void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) const {
if (needed > current_size) {
if (buf.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
buf = internal::DeviceBuffer(needed);
current_size = needed;
}
}
};
// ---- DeviceMatrix::operator=(SpMVExpr) out-of-line definition ----------------
// Defined here because it needs the full GpuSparseContext definition.
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const SpMVExpr<Scalar_>& expr) {
// Use spmv_device_exec — the sparse matrix was already uploaded by deviceView().
// No re-upload on repeated SpMV with the same view.
expr.view().context().spmv_device_exec(expr.x(), *this, Scalar_(1), Scalar_(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
return *this;
}
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_CONTEXT_H
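
A minimal sketch (not part of the patch) contrasting the host-vector path with the device-resident path. The DeviceMatrix upload constructor and default constructor used below are assumed from DeviceMatrix.h, which is outside this hunk.

const int n = 1000;
Eigen::SparseMatrix<double> A(n, n);          // ColMajor with int indices, the layout GpuSparseContext expects
// ... fill A (e.g., via setFromTriplets) ...
A.makeCompressed();
Eigen::VectorXd x = Eigen::VectorXd::Random(n);

// Host-vector path: uploads A and x, downloads y on every call.
Eigen::GpuSparseContext<double> ctx;
Eigen::VectorXd y = ctx.multiply(A, x);       // y = A * x

// Device-resident path sharing a GpuContext's stream (no host round trip):
Eigen::GpuContext gpu_ctx;
Eigen::GpuSparseContext<double> sctx(gpu_ctx);
Eigen::DeviceSparseView<double> d_A = sctx.deviceView(A);   // uploads and caches A once
Eigen::DeviceMatrix<double> d_x(x), d_y;                    // upload ctor assumed from DeviceMatrix.h
d_y = d_A * d_x;                                            // SpMV via SpMVExpr; A is not re-uploaded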

View File

@@ -0,0 +1,62 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse LDL^T / LDL^H factorization via cuDSS.
//
// For symmetric indefinite (or Hermitian indefinite) sparse matrices.
// Same three-phase workflow as GpuSparseLLT.
//
// Usage:
// GpuSparseLDLT<double> ldlt(A); // analyze + factorize
// VectorXd x = ldlt.solve(b); // solve
#ifndef EIGEN_GPU_SPARSE_LDLT_H
#define EIGEN_GPU_SPARSE_LDLT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSparseSolverBase.h"
namespace Eigen {
/** GPU sparse LDL^T factorization (symmetric indefinite / Hermitian indefinite).
*
* Wraps cuDSS with CUDSS_MTYPE_SYMMETRIC (real) or CUDSS_MTYPE_HERMITIAN (complex).
* Uses pivoting for numerical stability.
*
* \tparam Scalar_ float, double, complex<float>, or complex<double>
* \tparam UpLo_ Lower (default) or Upper — which triangle of A is stored
*/
template <typename Scalar_, int UpLo_ = Lower>
class GpuSparseLDLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT<Scalar_, UpLo_>> {
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT>;
friend Base;
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };
GpuSparseLDLT() = default;
template <typename InputType>
explicit GpuSparseLDLT(const SparseMatrixBase<InputType>& A) {
this->compute(A);
}
static constexpr bool needs_csr_conversion() { return false; }
static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_symmetric_type<Scalar>::value; }
static constexpr cudssMatrixViewType_t cudss_matrix_view() {
return internal::cudss_view_type<UpLo, ColMajor>::value;
}
};
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_LDLT_H

View File

@@ -0,0 +1,62 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse Cholesky (LL^T / LL^H) via cuDSS.
//
// Usage:
// GpuSparseLLT<double> llt(A); // analyze + factorize
// VectorXd x = llt.solve(b); // solve
// llt.analyzePattern(A); // or separate phases
// llt.factorize(A_new); // reuse symbolic analysis
#ifndef EIGEN_GPU_SPARSE_LLT_H
#define EIGEN_GPU_SPARSE_LLT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSparseSolverBase.h"
namespace Eigen {
/** GPU sparse Cholesky factorization (LL^T for real, LL^H for complex).
*
* Wraps cuDSS with CUDSS_MTYPE_SPD (real) or CUDSS_MTYPE_HPD (complex).
* Accepts ColMajor SparseMatrix (CSC), reinterpreted as CSR with swapped
* triangle view for zero-copy upload.
*
* \tparam Scalar_ float, double, complex<float>, or complex<double>
* \tparam UpLo_ Lower (default) or Upper — which triangle of A is stored
*/
template <typename Scalar_, int UpLo_ = Lower>
class GpuSparseLLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT<Scalar_, UpLo_>> {
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT>;
friend Base;
public:
using Scalar = Scalar_;
enum { UpLo = UpLo_ };
GpuSparseLLT() = default;
template <typename InputType>
explicit GpuSparseLLT(const SparseMatrixBase<InputType>& A) {
this->compute(A);
}
static constexpr bool needs_csr_conversion() { return false; }
static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_spd_type<Scalar>::value; }
static constexpr cudssMatrixViewType_t cudss_matrix_view() {
return internal::cudss_view_type<UpLo, ColMajor>::value;
}
};
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_LLT_H
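
A minimal sketch (not part of the patch) of the three-phase workflow, including a refactorization where only the numerical values change (the pattern must stay identical, per the factorize() contract in GpuSparseSolverBase).

Eigen::SparseMatrix<double> A;         // SPD, lower triangle stored, compressed
Eigen::VectorXd b;
// ... fill A and b ...
Eigen::GpuSparseLLT<double> llt;
llt.analyzePattern(A);                 // symbolic analysis (synchronous)
llt.factorize(A);                      // numeric factorization (asynchronous)
if (llt.info() == Eigen::Success) {    // info() synchronizes lazily
  Eigen::VectorXd x = llt.solve(b);
}
// Later: same sparsity pattern, new values; the symbolic analysis is reused.
llt.factorize(A);
Eigen::VectorXd x2 = llt.solve(b);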

View File

@@ -0,0 +1,59 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU sparse LU factorization via cuDSS.
//
// For general (non-symmetric) sparse matrices. Uses pivoting.
// Same three-phase workflow as GpuSparseLLT.
//
// Usage:
// GpuSparseLU<double> lu(A); // analyze + factorize
// VectorXd x = lu.solve(b); // solve
#ifndef EIGEN_GPU_SPARSE_LU_H
#define EIGEN_GPU_SPARSE_LU_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSparseSolverBase.h"
namespace Eigen {
/** GPU sparse LU factorization (general matrices).
*
* Wraps cuDSS with CUDSS_MTYPE_GENERAL and CUDSS_MVIEW_FULL.
* Accepts ColMajor SparseMatrix (CSC); internally converts to RowMajor
* CSR since cuDSS requires CSR input.
*
* \tparam Scalar_ float, double, complex<float>, or complex<double>
*/
template <typename Scalar_>
class GpuSparseLU : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLU<Scalar_>> {
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLU>;
friend Base;
public:
using Scalar = Scalar_;
GpuSparseLU() = default;
template <typename InputType>
explicit GpuSparseLU(const SparseMatrixBase<InputType>& A) {
this->compute(A);
}
static constexpr bool needs_csr_conversion() { return true; }
static constexpr cudssMatrixType_t cudss_matrix_type() { return CUDSS_MTYPE_GENERAL; }
static constexpr cudssMatrixViewType_t cudss_matrix_view() { return CUDSS_MVIEW_FULL; }
};
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_LU_H

View File

@@ -0,0 +1,356 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Common base for GPU sparse direct solvers (LLT, LDLT, LU) via cuDSS.
//
// All three solver types share the same three-phase workflow
// (analyzePattern → factorize → solve) and differ only in the
// cudssMatrixType_t / cudssMatrixViewType_t passed to cuDSS and in whether
// the input must first be converted to true CSR. This CRTP base implements
// the entire workflow; derived classes provide these choices via static
// constexpr members.
#ifndef EIGEN_GPU_SPARSE_SOLVER_BASE_H
#define EIGEN_GPU_SPARSE_SOLVER_BASE_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuDssSupport.h"
namespace Eigen {
namespace internal {
/** CRTP base for GPU sparse direct solvers.
*
* \tparam Scalar_ Element type (passed explicitly to avoid incomplete-type issues with CRTP).
* \tparam Derived The concrete solver class (GpuSparseLLT, GpuSparseLDLT, GpuSparseLU).
* Must provide:
 *                   - `static constexpr bool needs_csr_conversion()`
 *                   - `static constexpr cudssMatrixType_t cudss_matrix_type()`
 *                   - `static constexpr cudssMatrixViewType_t cudss_matrix_view()`
*/
template <typename Scalar_, typename Derived>
class GpuSparseSolverBase {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using StorageIndex = int;
using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
using CsrMat = SparseMatrix<Scalar, RowMajor, StorageIndex>;
using DenseVector = Matrix<Scalar, Dynamic, 1>;
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
GpuSparseSolverBase() { init_context(); }
~GpuSparseSolverBase() {
destroy_cudss_objects();
if (handle_) (void)cudssDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
GpuSparseSolverBase(const GpuSparseSolverBase&) = delete;
GpuSparseSolverBase& operator=(const GpuSparseSolverBase&) = delete;
// ---- Configuration --------------------------------------------------------
/** Set the fill-reducing ordering algorithm. Must be called before compute/analyzePattern. */
void setOrdering(GpuSparseOrdering ordering) { ordering_ = ordering; }
// ---- Factorization --------------------------------------------------------
/** Symbolic analysis + numeric factorization. */
template <typename InputType>
Derived& compute(const SparseMatrixBase<InputType>& A) {
analyzePattern(A);
if (info_ == Success) {
factorize(A);
}
return derived();
}
/** Symbolic analysis only. Uploads sparsity structure to device.
* This phase is synchronous (blocks until complete). */
template <typename InputType>
Derived& analyzePattern(const SparseMatrixBase<InputType>& A) {
const SpMat csc(A.derived());
eigen_assert(csc.rows() == csc.cols() && "GpuSparseSolver requires a square matrix");
eigen_assert(csc.isCompressed() && "GpuSparseSolver requires a compressed sparse matrix");
n_ = csc.rows();
info_ = InvalidInput;
analysis_done_ = false;
if (n_ == 0) {
nnz_ = 0;
info_ = Success;
analysis_done_ = true;
return derived();
}
// For symmetric solvers, ColMajor CSC can be reinterpreted as CSR with
// swapped triangle view (zero copy). For general solvers, we must convert
// to actual RowMajor CSR so cuDSS sees the correct matrix, not A^T.
if (Derived::needs_csr_conversion()) {
const CsrMat csr(csc);
nnz_ = csr.nonZeros();
upload_csr(csr);
} else {
nnz_ = csc.nonZeros();
upload_csr_from_csc(csc);
}
create_cudss_matrix();
apply_ordering_config();
if (data_) EIGEN_CUDSS_CHECK(cudssDataDestroy(handle_, data_));
EIGEN_CUDSS_CHECK(cudssDataCreate(handle_, &data_));
create_placeholder_dense();
EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_ANALYSIS, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
analysis_done_ = true;
info_ = Success;
return derived();
}
/** Numeric factorization using the symbolic analysis from analyzePattern.
*
* \warning The sparsity pattern (outerIndexPtr, innerIndexPtr) must be
* identical to the one passed to analyzePattern(). Only the numerical
* values may change. Passing a different pattern is undefined behavior.
* This matches the contract of CHOLMOD, UMFPACK, and cuDSS's own API.
*
* This phase is asynchronous — info() lazily synchronizes. */
template <typename InputType>
Derived& factorize(const SparseMatrixBase<InputType>& A) {
eigen_assert(analysis_done_ && "factorize() requires analyzePattern() first");
if (n_ == 0) {
info_ = Success;
return derived();
}
// Convert to the same format used in analyzePattern.
// Both temporaries must outlive the async memcpy (pageable H2D is actually
// synchronous w.r.t. the host, but keep them alive for clarity).
const SpMat csc(A.derived());
eigen_assert(csc.rows() == n_ && csc.cols() == n_);
const Scalar* value_ptr;
Index value_nnz;
CsrMat csr_tmp;
if (Derived::needs_csr_conversion()) {
csr_tmp = CsrMat(csc);
value_ptr = csr_tmp.valuePtr();
value_nnz = csr_tmp.nonZeros();
} else {
value_ptr = csc.valuePtr();
value_nnz = csc.nonZeros();
}
eigen_assert(value_nnz == nnz_);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, value_ptr, static_cast<size_t>(nnz_) * sizeof(Scalar),
cudaMemcpyHostToDevice, stream_));
EIGEN_CUDSS_CHECK(cudssMatrixSetValues(d_A_cudss_, d_values_.ptr));
info_ = InvalidInput;
info_synced_ = false;
EIGEN_CUDSS_CHECK(
cudssExecute(handle_, CUDSS_PHASE_FACTORIZATION, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
return derived();
}
// ---- Solve ----------------------------------------------------------------
/** Solve A * X = B. Returns X as a dense matrix.
* Supports single or multiple right-hand sides. */
template <typename Rhs>
DenseMatrix solve(const MatrixBase<Rhs>& B) const {
sync_info();
eigen_assert(info_ == Success && "GpuSparseSolver::solve requires a successful factorization");
eigen_assert(B.rows() == n_);
const DenseMatrix rhs(B);
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
if (n_ == 0) return DenseMatrix(0, rhs.cols());
const size_t rhs_bytes = static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar);
DeviceBuffer d_b(rhs_bytes);
DeviceBuffer d_x(rhs_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_b.ptr, rhs.data(), rhs_bytes, cudaMemcpyHostToDevice, stream_));
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
cudssMatrix_t b_cudss = nullptr, x_cudss = nullptr;
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&b_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_b.ptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&x_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_x.ptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_SOLVE, config_, data_, d_A_cudss_, x_cudss, b_cudss));
DenseMatrix X(n_, rhs.cols());
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_x.ptr, rhs_bytes, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
(void)cudssMatrixDestroy(b_cudss);
(void)cudssMatrixDestroy(x_cudss);
return X;
}
// ---- Accessors ------------------------------------------------------------
ComputationInfo info() const {
sync_info();
return info_;
}
Index rows() const { return n_; }
Index cols() const { return n_; }
cudaStream_t stream() const { return stream_; }
protected:
// ---- CUDA / cuDSS handles -------------------------------------------------
cudaStream_t stream_ = nullptr;
cudssHandle_t handle_ = nullptr;
cudssConfig_t config_ = nullptr;
cudssData_t data_ = nullptr;
cudssMatrix_t d_A_cudss_ = nullptr;
cudssMatrix_t d_x_cudss_ = nullptr;
cudssMatrix_t d_b_cudss_ = nullptr;
// ---- Device buffers for CSR arrays ----------------------------------------
DeviceBuffer d_rowPtr_;
DeviceBuffer d_colIdx_;
DeviceBuffer d_values_;
// ---- State ----------------------------------------------------------------
Index n_ = 0;
Index nnz_ = 0;
ComputationInfo info_ = InvalidInput;
bool info_synced_ = true;
bool analysis_done_ = false;
GpuSparseOrdering ordering_ = GpuSparseOrdering::AMD;
private:
Derived& derived() { return static_cast<Derived&>(*this); }
const Derived& derived() const { return static_cast<const Derived&>(*this); }
void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUDSS_CHECK(cudssCreate(&handle_));
EIGEN_CUDSS_CHECK(cudssSetStream(handle_, stream_));
EIGEN_CUDSS_CHECK(cudssConfigCreate(&config_));
}
void sync_info() const {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
int cudss_info = 0;
EIGEN_CUDSS_CHECK(cudssDataGet(handle_, data_, CUDSS_DATA_INFO, &cudss_info, sizeof(cudss_info), nullptr));
auto* self = const_cast<GpuSparseSolverBase*>(this);
self->info_ = (cudss_info == 0) ? Success : NumericalIssue;
self->info_synced_ = true;
}
}
void destroy_cudss_objects() {
if (d_A_cudss_) {
(void)cudssMatrixDestroy(d_A_cudss_);
d_A_cudss_ = nullptr;
}
if (d_x_cudss_) {
(void)cudssMatrixDestroy(d_x_cudss_);
d_x_cudss_ = nullptr;
}
if (d_b_cudss_) {
(void)cudssMatrixDestroy(d_b_cudss_);
d_b_cudss_ = nullptr;
}
if (data_) {
(void)cudssDataDestroy(handle_, data_);
data_ = nullptr;
}
if (config_) {
(void)cudssConfigDestroy(config_);
config_ = nullptr;
}
}
// Upload CSR from a RowMajor sparse matrix (native CSR).
void upload_csr(const CsrMat& csr) { upload_compressed(csr.outerIndexPtr(), csr.innerIndexPtr(), csr.valuePtr()); }
// Upload CSC arrays reinterpreted as CSR (for symmetric matrices: CSC(A) = CSR(A^T) = CSR(A)).
void upload_csr_from_csc(const SpMat& csc) {
upload_compressed(csc.outerIndexPtr(), csc.innerIndexPtr(), csc.valuePtr());
}
void upload_compressed(const StorageIndex* outer, const StorageIndex* inner, const Scalar* values) {
const size_t rowptr_bytes = static_cast<size_t>(n_ + 1) * sizeof(StorageIndex);
const size_t colidx_bytes = static_cast<size_t>(nnz_) * sizeof(StorageIndex);
const size_t values_bytes = static_cast<size_t>(nnz_) * sizeof(Scalar);
d_rowPtr_ = DeviceBuffer(rowptr_bytes);
d_colIdx_ = DeviceBuffer(colidx_bytes);
d_values_ = DeviceBuffer(values_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_rowPtr_.ptr, outer, rowptr_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_colIdx_.ptr, inner, colidx_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, values, values_bytes, cudaMemcpyHostToDevice, stream_));
}
void create_cudss_matrix() {
if (d_A_cudss_) (void)cudssMatrixDestroy(d_A_cudss_);
constexpr cudaDataType_t idx_type = cudss_index_type<StorageIndex>::value;
constexpr cudaDataType_t val_type = cuda_data_type<Scalar>::value;
constexpr cudssMatrixType_t mtype = Derived::cudss_matrix_type();
constexpr cudssMatrixViewType_t mview = Derived::cudss_matrix_view();
EIGEN_CUDSS_CHECK(cudssMatrixCreateCsr(
&d_A_cudss_, static_cast<int64_t>(n_), static_cast<int64_t>(n_), static_cast<int64_t>(nnz_), d_rowPtr_.ptr,
/*rowEnd=*/nullptr, d_colIdx_.ptr, d_values_.ptr, idx_type, val_type, mtype, mview, CUDSS_BASE_ZERO));
}
void apply_ordering_config() {
cudssAlgType_t alg;
switch (ordering_) {
case GpuSparseOrdering::AMD:
alg = CUDSS_ALG_DEFAULT;
break;
case GpuSparseOrdering::METIS:
alg = CUDSS_ALG_2;
break;
case GpuSparseOrdering::RCM:
alg = CUDSS_ALG_3;
break;
default:
alg = CUDSS_ALG_DEFAULT;
break;
}
EIGEN_CUDSS_CHECK(cudssConfigSet(config_, CUDSS_CONFIG_REORDERING_ALG, &alg, sizeof(alg)));
}
void create_placeholder_dense() {
if (d_x_cudss_) (void)cudssMatrixDestroy(d_x_cudss_);
if (d_b_cudss_) (void)cudssMatrixDestroy(d_b_cudss_);
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_x_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_b_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
dtype, CUDSS_LAYOUT_COL_MAJOR));
}
};
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_SOLVER_BASE_H
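
The CRTP contract is small: a derived solver only supplies the three static constexpr hooks consumed by the base class. A sketch (not part of the patch; `MyGpuSolver` is a placeholder name, and the constants simply mirror GpuSparseLU above rather than defining a new solver):

template <typename Scalar_>
class MyGpuSolver : public Eigen::internal::GpuSparseSolverBase<Scalar_, MyGpuSolver<Scalar_>> {
  using Base = Eigen::internal::GpuSparseSolverBase<Scalar_, MyGpuSolver>;
  friend Base;
 public:
  using Scalar = Scalar_;
  // Convert ColMajor CSC input to true CSR before upload (general, non-symmetric case).
  static constexpr bool needs_csr_conversion() { return true; }
  static constexpr cudssMatrixType_t cudss_matrix_type() { return CUDSS_MTYPE_GENERAL; }
  static constexpr cudssMatrixViewType_t cudss_matrix_view() { return CUDSS_MVIEW_FULL; }
};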

View File

@@ -21,6 +21,7 @@
#include "./InternalHeaderCheck.h"
#include <cuda_runtime.h>
#include <vector>
namespace Eigen {
namespace internal {
@@ -36,26 +37,99 @@ namespace internal {
// ---- RAII: device buffer ----------------------------------------------------
// Thread-local pool of small device buffers to avoid cudaMalloc/cudaFree
// overhead for tiny allocations (e.g., DeviceScalar). Buffers up to
// kSmallBufferThreshold bytes are recycled; larger allocations bypass the pool.
template <size_t SmallBufferThreshold = 256, size_t MaxPoolSize = 64>
struct DeviceBufferPool {
static constexpr size_t kSmallBufferThreshold = SmallBufferThreshold;
static constexpr size_t kMaxPoolSize = MaxPoolSize;
struct Entry {
void* ptr;
size_t bytes;
};
~DeviceBufferPool() {
for (auto& e : free_list_) (void)cudaFree(e.ptr);
}
void* allocate(size_t bytes) {
// Search for a buffer of sufficient size.
for (size_t i = 0; i < free_list_.size(); ++i) {
if (free_list_[i].bytes >= bytes) {
void* p = free_list_[i].ptr;
free_list_[i] = free_list_.back();
free_list_.pop_back();
return p;
}
}
// No suitable buffer found — allocate new.
void* p = nullptr;
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
return p;
}
void deallocate(void* p, size_t bytes) {
if (free_list_.size() < kMaxPoolSize) {
free_list_.push_back({p, bytes});
} else {
(void)cudaFree(p);
}
}
static DeviceBufferPool& threadLocal() {
thread_local DeviceBufferPool pool;
return pool;
}
private:
std::vector<Entry> free_list_;
};
struct DeviceBuffer {
void* ptr = nullptr;
DeviceBuffer() = default;
explicit DeviceBuffer(size_t bytes) {
if (bytes > 0) EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&ptr, bytes));
explicit DeviceBuffer(size_t bytes) : size_(bytes) {
if (bytes > 0) {
if (bytes <= DeviceBufferPool<>::kSmallBufferThreshold) {
ptr = DeviceBufferPool<>::threadLocal().allocate(bytes);
} else {
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&ptr, bytes));
}
}
}
~DeviceBuffer() {
if (ptr) (void)cudaFree(ptr); // destructor: ignore errors
if (ptr) {
if (size_ <= DeviceBufferPool<>::kSmallBufferThreshold) {
DeviceBufferPool<>::threadLocal().deallocate(ptr, size_);
} else {
(void)cudaFree(ptr);
}
}
}
// Move-only.
DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr) { o.ptr = nullptr; }
DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr), size_(o.size_) {
o.ptr = nullptr;
o.size_ = 0;
}
DeviceBuffer& operator=(DeviceBuffer&& o) noexcept {
if (this != &o) {
if (ptr) (void)cudaFree(ptr);
if (ptr) {
if (size_ <= DeviceBufferPool<>::kSmallBufferThreshold) {
DeviceBufferPool<>::threadLocal().deallocate(ptr, size_);
} else {
(void)cudaFree(ptr);
}
}
ptr = o.ptr;
size_ = o.size_;
o.ptr = nullptr;
o.size_ = 0;
}
return *this;
}
@@ -63,12 +137,19 @@ struct DeviceBuffer {
DeviceBuffer(const DeviceBuffer&) = delete;
DeviceBuffer& operator=(const DeviceBuffer&) = delete;
size_t size() const { return size_; }
// Adopt an existing device pointer. Caller relinquishes ownership.
// Adopted buffers bypass the pool on destruction.
static DeviceBuffer adopt(void* p) {
DeviceBuffer b;
b.ptr = p;
b.size_ = DeviceBufferPool<>::kSmallBufferThreshold + 1; // force cudaFree
return b;
}
private:
size_t size_ = 0;
};
// ---- Scalar → cudaDataType_t ------------------------------------------------

View File

@@ -1,8 +1,8 @@
# Eigen GPU Module (`Eigen/GPU`)
GPU-accelerated dense linear algebra for Eigen users, dispatching to NVIDIA
CUDA libraries (cuBLAS, cuSOLVER). Requires CUDA 11.4+. Header-only (link
against CUDA runtime, cuBLAS, and cuSOLVER).
GPU-accelerated linear algebra for Eigen users, dispatching to NVIDIA CUDA
libraries (cuBLAS, cuSOLVER, cuFFT, cuSPARSE, cuDSS). Requires CUDA 11.4+;
cuDSS features require CUDA 12.0+ and a separate cuDSS install. Header-only.
## Why this module
@@ -10,25 +10,31 @@ Eigen is the linear algebra foundation for a large ecosystem of C++ projects
in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP,
Open3D), scientific computing (Ceres, Stan), and beyond. Many of these
projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations
without dropping down to raw CUDA library APIs. Third-party projects like
[EigenCuda](https://github.com/NLESC-JCER/EigenCuda) and
[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically to fill
this gap, and downstream projects like
[Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
GPU-accelerated solvers through Eigen.
without dropping down to raw CUDA library APIs.
The `Eigen/GPU` module aims to close this gap: Existing Eigen users should be
able to move performance-critical dense linear algebra to the GPU with minimal
code changes and without learning CUDA library APIs directly.
GPU sparse solvers are a particularly acute gap. Sparse factorization is the
bottleneck in SLAM, bundle adjustment, FEM, and nonlinear optimization --
exactly the workloads where GPU acceleration matters most. Downstream projects
like [Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
GPU-accelerated sparse solvers, and third-party projects like
[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically because
Eigen lacks them. The `Eigen/GPU` module provides GPU sparse Cholesky, LDL^T,
and LU factorization via cuDSS, alongside dense solvers (cuSOLVER), matrix
products (cuBLAS), FFT (cuFFT), and sparse matrix-vector products (cuSPARSE).
Existing Eigen users should be able to move performance-critical dense or
sparse linear algebra to the GPU with minimal code changes and without
learning CUDA library APIs directly.
## Design philosophy
**CPU and GPU coexist.** There is no global compile-time switch that replaces
CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers
explicitly -- `GpuLLT<double>` vs `LLT<MatrixXd>` -- and both coexist in
the same binary. This also lets users keep the factored matrix on device across
multiple solves, something impossible with compile-time replacement.
explicitly -- `GpuLLT<double>` vs `LLT<MatrixXd>`, `GpuSparseLLT<double>` vs
`SimplicialLLT<SparseMatrix<double>>` -- and both coexist in the same binary.
This also lets users keep the factored matrix on device across multiple solves,
something impossible with compile-time replacement.
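For instance, with the `GpuLLT` API described below, a factorization can be
computed once and reused for several right-hand sides without leaving the
device (a minimal sketch; `A`, `b1`, `b2` are assumed host objects):
```cpp
GpuLLT<double> llt;
llt.compute(A);               // upload + factorize once; the factor stays on device
VectorXd x1 = llt.solve(b1);  // each solve reuses the device-resident factor
VectorXd x2 = llt.solve(b2);  // no re-upload, no re-factorization
```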
**Familiar syntax.** GPU operations use the same expression patterns as CPU
Eigen. Here is a side-by-side comparison:
@@ -38,6 +44,7 @@ Eigen. Here is a side-by-side comparison:
#include <Eigen/Dense> #define EIGEN_USE_GPU
#include <Eigen/GPU>
// Dense
MatrixXd A = ...; auto d_A = DeviceMatrix<double>::fromHost(A);
MatrixXd B = ...; auto d_B = DeviceMatrix<double>::fromHost(B);
@@ -45,11 +52,32 @@ MatrixXd C = A * B; DeviceMatrix<double> d_C = d_A * d_B;
MatrixXd X = A.llt().solve(B); DeviceMatrix<double> d_X = d_A.llt().solve(d_B);
MatrixXd X = d_X.toHost();
// Sparse (using SpMat = SparseMatrix<double>)
SimplicialLLT<SpMat> llt(A); GpuSparseLLT<double> llt(A);
VectorXd x = llt.solve(b); VectorXd x = llt.solve(b);
```
The GPU version reads like CPU Eigen with explicit upload/download.
`operator*` dispatches to cuBLAS GEMM, `.llt().solve()` dispatches to
cuSOLVER potrf + potrs. Unsupported expressions are compile errors.
The GPU version reads like CPU Eigen with explicit upload/download for dense
operations, and an almost identical API for sparse solvers. Unsupported
expressions are compile errors.
**Standalone module.** `Eigen/GPU` does not modify or depend on Eigen's Core
expression template system (`MatrixBase`, `CwiseBinaryOp`, etc.).
`DeviceMatrix` is not an Eigen expression type and does not inherit from
`MatrixBase`. The expression layer is a thin compile-time dispatch where every
supported expression maps to a single NVIDIA library call. There is no
coefficient-level evaluation, lazy fusion, or packet operations.
**Interoperability where useful.** `DeviceMatrix` provides the same operator
signatures as `Matrix` for common vector operations: `+=`, `-=`, `*=`,
`dot()`, `squaredNorm()`, `norm()`, `setZero()`, and `noalias()`. This makes
`DeviceMatrix` usable as a drop-in `VectorType` in Eigen algorithm templates
that rely on these operations. For example, Eigen's `conjugate_gradient()`
template works with `DeviceMatrix` with a single typedef change -- no
modifications to the algorithm or the expression template system. Conjugate
gradient is just the motivating example; we are open to expanding operator
coverage as needed to support other high-level Eigen algorithms on the GPU.
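Concretely, the only change needed inside such an algorithm template is the
vector typedef, as the `ConjugateGradient.h` change in this series does:
```cpp
// Before: hard-coded host vector type
typedef Matrix<Scalar, Dynamic, 1> VectorType;
// After: the destination's plain type, which is DeviceMatrix for GPU callers
typedef typename Dest::PlainObject VectorType;
```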
**Explicit over implicit.** Host-device transfers, stream management, and
library handle lifetimes are visible in the API. There are no hidden
@@ -85,6 +113,27 @@ MatrixXd C = transfer.get();
`selfadjointView<UpLo>()`, `llt()`, `lu()`. These return lightweight
expression objects that are evaluated when assigned.
For BLAS Level-1 operations, `DeviceMatrix` also provides `dot()`, `norm()`,
`squaredNorm()`, `setZero()`, `noalias()`, and arithmetic operators
(`+=`, `-=`, `*=`) that dispatch to cuBLAS `axpy`, `nrm2`, `dot`, and
`geam`. These are the operations needed by iterative solvers.
### `DeviceScalar<Scalar>`
A device-resident scalar value. Reductions like `dot()`, `norm()`, and
`squaredNorm()` return `DeviceScalar` instead of a host scalar, deferring
the host synchronization until the value is actually needed:
```cpp
auto dot_val = d_x.dot(d_y); // DeviceScalar -- no sync
auto norm_sq = d_r.squaredNorm(); // DeviceScalar -- no sync
Scalar alpha = dot_val / norm_sq; // sync here (implicit conversion)
d_x += alpha * d_p; // host scalar * DeviceMatrix (axpy)
```
Division between `DeviceScalar` values (real types only) is performed on
device via NPP, avoiding extra synchronizations.
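To keep the quotient on device as well, divide the two `DeviceScalar` values
directly (a no-sync variant of the example above, for real scalar types):
```cpp
auto alpha = d_x.dot(d_y) / d_r.squaredNorm(); // DeviceScalar / DeviceScalar -- stays on device
d_x += alpha * d_p;                            // axpy in device pointer mode, still no sync
```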
### `GpuContext`
Every GPU operation needs a CUDA stream and library handles (cuBLAS,
@@ -107,6 +156,12 @@ d_C1.device(ctx1) = d_A1 * d_B1; // runs on stream 1
d_C2.device(ctx2) = d_A2 * d_B2; // runs on stream 2 (concurrently)
```
To integrate with existing CUDA code, borrow an existing stream:
```cpp
GpuContext ctx(my_existing_stream); // wraps stream, does not take ownership
```
## Usage
### Matrix operations (cuBLAS)
@@ -122,7 +177,7 @@ d_C = d_A * d_B.transpose();
// Scaled and accumulated
d_C += 2.0 * d_A * d_B; // alpha=2, beta=1
d_C.device(ctx) -= d_A * d_B; // alpha=-1, beta=1 (requires explicit context)
d_C.device(ctx) -= d_A * d_B; // alpha=-1, beta=1 (GEMM requires explicit context for -=)
// Triangular solve (TRSM)
d_X = d_A.triangularView<Lower>().solve(d_B);
@@ -134,6 +189,30 @@ d_C = d_A.selfadjointView<Lower>() * d_B;
d_C.selfadjointView<Lower>().rankUpdate(d_A); // C += A * A^H
```
### BLAS Level-1 operations
```cpp
// Dot product and norms (return DeviceScalar -- no sync until read)
auto dot_val = d_x.dot(d_y); // cublasDdot / cublasCdotc
auto norm_val = d_r.norm(); // cublasDnrm2
double n = norm_val; // implicit conversion triggers sync
// Vector arithmetic (cuBLAS axpy / geam)
d_x += alpha * d_p; // axpy: x = x + alpha * p
d_x -= alpha * d_p; // axpy: x = x - alpha * p
d_x *= alpha; // scal: x = alpha * x
d_r.setZero(); // cudaMemsetAsync
// DeviceScalar arithmetic (stays on device, real types only)
auto alpha = absNew / dot_val; // device-side division via NPP
d_x += alpha * d_p; // DeviceScalar * DeviceMatrix (axpy with device pointer)
// Matrix add/subtract (cuBLAS geam)
DeviceMatrix<double> d_C = d_A + d_B; // C = A + B
d_C = d_A + 2.0 * d_B; // C = A + 2*B
d_C = d_A - d_B; // C = A - B
```
### Dense solvers (cuSOLVER)
**One-shot expression syntax** -- Convenient, re-factorizes each time:
@@ -162,31 +241,141 @@ lu.compute(d_A);
auto d_Y = lu.solve(d_B, GpuLU<double>::Transpose); // A^T Y = B
// QR solve (overdetermined least squares)
GpuQR<double> qr(A); // host matrix input
MatrixXd X = qr.solve(B); // Q^H * B via ormqr, then trsm on R
GpuQR<double> qr;
qr.compute(d_A); // factorize on device (async)
auto d_X = qr.solve(d_B); // Q^H * B via ormqr, then trsm on R
MatrixXd X = d_X.toHost();
// SVD
GpuSVD<double> svd(A, ComputeThinU | ComputeThinV);
VectorXd S = svd.singularValues();
MatrixXd U = svd.matrixU();
MatrixXd VT = svd.matrixVT();
MatrixXd X = svd.solve(B); // pseudoinverse solve
// SVD (results downloaded on access)
GpuSVD<double> svd;
svd.compute(d_A, ComputeThinU | ComputeThinV);
VectorXd S = svd.singularValues(); // downloads to host
MatrixXd U = svd.matrixU(); // downloads to host
MatrixXd V = svd.matrixV(); // V (matches JacobiSVD)
MatrixXd VT = svd.matrixVT(); // V^T (matches cuSOLVER)
// Self-adjoint eigenvalue decomposition
GpuSelfAdjointEigenSolver<double> es(A);
VectorXd eigenvals = es.eigenvalues();
MatrixXd eigenvecs = es.eigenvectors();
// Self-adjoint eigenvalue decomposition (results downloaded on access)
GpuSelfAdjointEigenSolver<double> es;
es.compute(d_A);
VectorXd eigenvals = es.eigenvalues(); // downloads to host
MatrixXd eigenvecs = es.eigenvectors(); // downloads to host
```
The cached API keeps the factored matrix on device, avoiding redundant
host-device transfers and re-factorizations.
host-device transfers and re-factorizations. All solvers also accept host
matrices directly as a convenience (e.g., `GpuLLT<double> llt(A)` or
`qr.solve(B)`), which handles upload/download internally.
### Sparse direct solvers (cuDSS)
Requires cuDSS (separate install, CUDA 12.0+). Define `EIGEN_CUDSS` before
including `Eigen/GPU` and link with `-lcudss`.
```cpp
SparseMatrix<double> A = ...; // symmetric positive definite
VectorXd b = ...;
// Sparse Cholesky -- one-liner
GpuSparseLLT<double> llt(A);
VectorXd x = llt.solve(b);
// Three-phase workflow for repeated solves with the same sparsity pattern
GpuSparseLLT<double> llt;
llt.analyzePattern(A); // symbolic analysis (once)
llt.factorize(A); // numeric factorization
VectorXd x = llt.solve(b);
llt.factorize(A_new_values); // refactorize (reuses symbolic analysis)
VectorXd x2 = llt.solve(b);
// Sparse LDL^T (symmetric indefinite)
GpuSparseLDLT<double> ldlt(A);
VectorXd x = ldlt.solve(b);
// Sparse LU (general non-symmetric)
GpuSparseLU<double> lu(A);
VectorXd x = lu.solve(b);
```
### FFT (cuFFT)
```cpp
GpuFFT<float> fft;
// 1D complex-to-complex
VectorXcf X = fft.fwd(x); // forward
VectorXcf y = fft.inv(X); // inverse (scaled by 1/n)
// 1D real-to-complex / complex-to-real
VectorXcf R = fft.fwd(r); // returns n/2+1 complex (half-spectrum)
VectorXf s = fft.invReal(R, n); // C2R inverse, caller specifies n
// 2D complex-to-complex
MatrixXcf B = fft.fwd2d(A); // 2D forward
MatrixXcf C = fft.inv2d(B); // 2D inverse (scaled by 1/(rows*cols))
// Plans are cached and reused across calls with the same size/type.
```
### Sparse matrix-vector multiply (cuSPARSE)
```cpp
SparseMatrix<double> A = ...;
VectorXd x = ...;
// Host vectors (upload/download handled internally)
GpuSparseContext<double> spmv;
VectorXd y = spmv.multiply(A, x); // y = A * x
VectorXd z = spmv.multiplyT(A, x); // z = A^T * x
spmv.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y
MatrixXd Y = spmv.multiplyMat(A, X); // Y = A * X (SpMM)
// Device-resident SpMV (sparse matrix cached on device)
GpuSparseContext<double> spmv(ctx); // share GpuContext for same-stream
auto d_A = spmv.deviceView(A); // upload sparse matrix once
d_y = d_A * d_x; // operator syntax, stays on device
```
### Eigen algorithm interop (example: Conjugate gradient)
The BLAS-1 operators and `DeviceSparseView` make `DeviceMatrix` usable as a
vector type in GPU implementations of iterative Eigen algorithms. Conjugate
gradient is the motivating example -- a GPU CG implementation uses the same
operations as the CPU version:
```cpp
GpuContext ctx;
GpuSparseContext<double> spmv(ctx);
auto d_A = spmv.deviceView(A); // sparse matrix on device
auto d_b = DeviceMatrix<double>::fromHost(b);
auto d_x = DeviceMatrix<double>::fromHost(x0);
// CG iteration using DeviceMatrix operators
DeviceMatrix<double> d_r = d_b; // r = b (deep copy via geam)
DeviceMatrix<double> d_p(n), d_tmp(n);
d_tmp = d_A * d_x; // SpMV (device-resident)
d_r -= d_tmp; // axpy
d_p = d_r.clone();
RealScalar absNew = d_r.squaredNorm(); // DeviceScalar -> implicit sync
for (int i = 0; i < maxIters && absNew > tol * tol; ++i) {
d_tmp = d_A * d_p; // SpMV
auto alpha = absNew / d_p.dot(d_tmp); // host / DeviceScalar -> DeviceScalar
d_x += alpha * d_p; // axpy with DeviceScalar
d_r -= alpha * d_tmp; // axpy with DeviceScalar
RealScalar absOld = absNew;
absNew = d_r.squaredNorm(); // DeviceScalar -> implicit sync
d_p *= Scalar(absNew / absOld); // scal (host scalars)
d_p += d_r; // axpy
}
MatrixXd x = d_x.toHost();
```
### Precision control
GEMM dispatch enables tensor core algorithms by default, allowing cuBLAS to
choose the fastest algorithm for the given precision and architecture. For
double precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64
results computed faster via tensor cores.
GEMM dispatch uses `cublasLtMatmul` with heuristic algorithm selection,
enabling cuBLAS to choose tensor core algorithms when beneficial. For double
precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64 results
computed faster via tensor cores.
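For example, a build that needs bit-exact reproducibility can opt out of
tensor core algorithms by defining the corresponding macro before including
the module (a minimal sketch; the macros are listed in the table below):
```cpp
#define EIGEN_NO_CUDA_TENSOR_OPS   // disable tensor core algorithms
#define EIGEN_USE_GPU
#include <Eigen/GPU>
```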
| Macro | Effect |
|---|---|
@@ -209,6 +398,7 @@ Mandatory sync points:
- `fromHost()` -- Synchronizes to complete the upload before returning
- `toHost()` / `HostTransfer::get()` -- Must deliver data to host
- `info()` -- Must read the factorization status
- `DeviceScalar` implicit conversion -- Downloads scalar from device
**Cross-stream safety** is automatic. `DeviceMatrix` tracks write completion
via CUDA events. When a matrix written on stream A is read on stream B, the
@@ -219,51 +409,121 @@ skip the wait (CUDA guarantees in-order execution within a stream).
### Supported scalar types
`float`, `double`, `std::complex<float>`, `std::complex<double>`.
`float`, `double`, `std::complex<float>`, `std::complex<double>` (unless
noted otherwise).
### Expression -> library call mapping
| DeviceMatrix expression | Library call | Parameters |
|---|---|---|
| `C = A * B` | `cublasGemmEx` | transA=N, transB=N, alpha=1, beta=0 |
| `C = A.adjoint() * B` | `cublasGemmEx` | transA=C, transB=N |
| `C = A.transpose() * B` | `cublasGemmEx` | transA=T, transB=N |
| `C = A * B.adjoint()` | `cublasGemmEx` | transA=N, transB=C |
| `C = A * B.transpose()` | `cublasGemmEx` | transA=N, transB=T |
| `C = alpha * A * B` | `cublasGemmEx` | alpha from LHS |
| `C = A * (alpha * B)` | `cublasGemmEx` | alpha from RHS |
| `C += A * B` | `cublasGemmEx` | alpha=1, beta=1 |
| `C.device(ctx) -= A * B` | `cublasGemmEx` | alpha=-1, beta=1 |
| `C = A * B` | `cublasLtMatmul` | transA=N, transB=N, alpha=1, beta=0 |
| `C = A.adjoint() * B` | `cublasLtMatmul` | transA=C, transB=N |
| `C = A.transpose() * B` | `cublasLtMatmul` | transA=T, transB=N |
| `C = A * B.adjoint()` | `cublasLtMatmul` | transA=N, transB=C |
| `C = A * B.transpose()` | `cublasLtMatmul` | transA=N, transB=T |
| `C = alpha * A * B` | `cublasLtMatmul` | alpha from LHS |
| `C = A * (alpha * B)` | `cublasLtMatmul` | alpha from RHS |
| `C += A * B` | `cublasLtMatmul` | alpha=1, beta=1 |
| `C.device(ctx) -= A * B` | `cublasLtMatmul` | alpha=-1, beta=1 |
| `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs |
| `X = A.llt<Upper>().solve(B)` | same | uplo=Upper |
| `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs |
| `X = A.triangularView<L>().solve(B)` | `cublasXtrsm` | side=L, uplo, diag=NonUnit |
| `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
| `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
| `C = A + B` | `cublasXgeam` | alpha=1, beta=1 |
| `C = A + alpha * B` | `cublasXgeam` | alpha=1, beta from scaled |
| `C = A - B` | `cublasXgeam` | alpha=1, beta=-1 |
| `C = A - alpha * B` | `cublasXgeam` | alpha=1, beta=-scaled |
| `x += alpha * y` | `cublasXaxpy` | alpha (host scalar) |
| `x += dAlpha * y` | `cublasXaxpy` | alpha (DeviceScalar, device pointer mode) |
| `x -= alpha * y` | `cublasXaxpy` | alpha negated |
| `x *= alpha` | `cublasXscal` | alpha (host or DeviceScalar) |
| `x.dot(y)` | `cublasXdot` / `cublasXdotc` | returns `DeviceScalar` |
| `x.norm()` | `cublasXnrm2` | returns `DeviceScalar<RealScalar>` |
| `x.squaredNorm()` | `cublasXdot(x, x)` | returns `DeviceScalar<RealScalar>` |
| `d_y = view * d_x` | `cusparseSpMV` | device-resident SpMV |
### `DeviceMatrix<Scalar>` API
### `DeviceMatrix<Scalar>`
| Method | Sync? | Description |
|--------|-------|-------------|
| `DeviceMatrix()` | -- | Empty (0x0) |
| `DeviceMatrix(rows, cols)` | -- | Allocate uninitialized |
| `fromHost(matrix, stream)` | yes | Upload from Eigen matrix |
| `fromHostAsync(ptr, rows, cols, outerStride, stream)` | no | Async upload (caller manages lifetime) |
| `toHost(stream)` | yes | Synchronous download |
| `toHostAsync(stream)` | no | Returns `HostTransfer` future |
| `clone(stream)` | no | Device-to-device deep copy |
| `resize(rows, cols)` | -- | Discard contents, reallocate |
| `data()` | -- | Raw device pointer |
| `rows()`, `cols()` | -- | Dimensions |
| `sizeInBytes()` | -- | Total device allocation size in bytes |
| `empty()` | -- | True if 0x0 |
| `adjoint()` | -- | Adjoint view (GEMM ConjTrans) |
| `transpose()` | -- | Transpose view (GEMM Trans) |
| `llt()` / `llt<UpLo>()` | -- | Cholesky expression builder |
| `lu()` | -- | LU expression builder |
| `triangularView<UpLo>()` | -- | Triangular view (TRSM) |
| `selfadjointView<UpLo>()` | -- | Self-adjoint view (SYMM, rankUpdate) |
| `device(ctx)` | -- | Assignment proxy bound to context |
Typed RAII wrapper for a dense column-major matrix in GPU device memory.
Always dense (leading dimension = rows). A vector is a `DeviceMatrix` with
one column.
```cpp
// Construction
DeviceMatrix<Scalar>() // Empty (0x0)
DeviceMatrix<Scalar>(Index n) // Allocate column vector (n x 1)
DeviceMatrix<Scalar>(rows, cols) // Allocate uninitialized
// Upload / download
static DeviceMatrix fromHost(matrix, stream=nullptr) // -> DeviceMatrix (syncs)
static DeviceMatrix fromHostAsync(ptr, rows, cols, stream) // -> DeviceMatrix (no sync, caller manages ptr lifetime)
PlainMatrix toHost(stream=nullptr) // -> host Matrix (syncs)
HostTransfer toHostAsync(stream=nullptr) // -> HostTransfer future (no sync)
DeviceMatrix clone(stream=nullptr) // -> DeviceMatrix (D2D copy, async)
// Dimensions and access
Index rows()
Index cols()
size_t sizeInBytes()
bool empty()
Scalar* data() // Raw device pointer
void resize(Index rows, Index cols) // Discard contents, reallocate
// Expression builders (return lightweight views, evaluated on assignment)
AdjointView adjoint() // GEMM with ConjTrans
TransposeView transpose() // GEMM with Trans
LltExpr llt() / llt<UpLo>() // -> .solve(d_B) -> DeviceMatrix
LuExpr lu() // -> .solve(d_B) -> DeviceMatrix
TriangularView triangularView<UpLo>() // -> .solve(d_B) -> DeviceMatrix (TRSM)
SelfAdjointView selfadjointView<UpLo>() // -> * d_B (SYMM), .rankUpdate(d_A) (SYRK)
DeviceAssignment device(GpuContext& ctx) // Bind assignment to explicit stream
DeviceMatrix& noalias() // No-op (all ops are implicitly noalias)
// BLAS Level-1 (all have overloads with explicit GpuContext& parameter)
DeviceScalar<Scalar> dot(const DeviceMatrix& other) // cuBLAS dot/dotc -> DeviceScalar
DeviceScalar<RealScalar> norm() // cuBLAS nrm2 -> DeviceScalar
DeviceScalar<RealScalar> squaredNorm() // dot(self, self) -> DeviceScalar (no sync)
void setZero() // cudaMemsetAsync
void addScaled(GpuContext&, Scalar alpha, const DeviceMatrix& x) // this += alpha * x (axpy)
void scale(GpuContext&, Scalar alpha) // this *= alpha (scal)
void copyFrom(GpuContext&, const DeviceMatrix& other) // this = other (D2D copy)
DeviceMatrix& operator+=(Scalar * DeviceMatrix) // cuBLAS axpy
DeviceMatrix& operator-=(Scalar * DeviceMatrix) // cuBLAS axpy (negated)
DeviceMatrix& operator+=(const DeviceMatrix&) // cuBLAS axpy
DeviceMatrix& operator-=(const DeviceMatrix&) // cuBLAS axpy
DeviceMatrix& operator+=(const DeviceScaledDevice&) // cuBLAS axpy (DeviceScalar * DeviceMatrix)
DeviceMatrix& operator-=(const DeviceScaledDevice&) // cuBLAS axpy (DeviceScalar * DeviceMatrix, negated)
DeviceMatrix& operator*=(Scalar) // cuBLAS scal
DeviceMatrix& operator*=(const DeviceScalar<Scalar>&) // cuBLAS scal (device pointer)
DeviceMatrix cwiseProduct(GpuContext&, const DeviceMatrix&) // NPP nppsMul (float/double only)
void cwiseProduct(GpuContext&, const DeviceMatrix&, const DeviceMatrix&) // in-place: this = a .* b
// geam expressions (evaluated on assignment)
DeviceMatrix& operator=(const DeviceAddExpr&) // C = A + B, C = A + alpha*B, C = A - B, etc.
```
### `DeviceScalar<Scalar>`
Device-resident scalar. Returned by `dot()`, `norm()`, and `squaredNorm()`.
Implicit conversion to `Scalar` triggers `cudaStreamSynchronize` + download.
```cpp
DeviceScalar(cudaStream_t stream = nullptr) // Allocate uninitialized
DeviceScalar(Scalar host_val, cudaStream_t stream) // Upload host value
Scalar get() // Download (syncs stream)
operator Scalar() // Implicit conversion (syncs)
Scalar* devicePtr() // Raw device pointer
cudaStream_t stream()
// Device-side arithmetic (no host sync, real types only)
DeviceScalar operator/(DeviceScalar, DeviceScalar) // NPP nppsDiv
DeviceScalar operator/(Scalar, DeviceScalar) // upload + div
DeviceScalar operator/(DeviceScalar, Scalar) // upload + div
DeviceScalar operator-() // NPP nppsMulC(-1)
```
### `GpuContext`
@@ -271,101 +531,221 @@ Unified GPU execution context owning a CUDA stream and library handles.
```cpp
GpuContext() // Creates dedicated stream + handles
GpuContext(cudaStream_t stream) // Borrow existing stream (not owned)
static GpuContext& threadLocal() // Per-thread default (lazy-created)
static void setThreadLocal(GpuContext* ctx) // Override thread-local default (nullptr restores)
cudaStream_t stream()
cublasHandle_t cublasHandle()
cusolverDnHandle_t cusolverHandle()
cublasLtHandle_t cublasLtHandle() // Lazy-initialized
cusparseHandle_t cusparseHandle() // Lazy-initialized
```
Non-copyable, non-movable (owns library handles).
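A sketch combining stream borrowing with `setThreadLocal()` (assuming that
operations not bound via `.device(ctx)` pick up the thread-local context;
`my_existing_stream` is a hypothetical application stream):
```cpp
GpuContext ctx(my_existing_stream);    // borrow the application's stream (not owned)
GpuContext::setThreadLocal(&ctx);      // make it this thread's default context
// ... GPU operations on this thread now run on my_existing_stream ...
GpuContext::setThreadLocal(nullptr);   // restore the lazily-created per-thread default
```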
### `GpuLLT<Scalar, UpLo>` API
### `GpuLLT<Scalar, UpLo>` -- Dense Cholesky (cuSOLVER)
GPU dense Cholesky (LL^T) via cuSOLVER. Caches factor on device.
Caches the Cholesky factor on device for repeated solves.
| Method | Sync? | Description |
|--------|-------|-------------|
| `GpuLLT(A)` | deferred | Construct and factorize from host matrix |
| `compute(host_matrix)` | deferred | Upload and factorize |
| `compute(DeviceMatrix)` | deferred | D2D copy and factorize |
| `compute(DeviceMatrix&&)` | deferred | Move-adopt and factorize (no copy) |
| `solve(host_matrix)` | yes | Solve, return host matrix |
| `solve(DeviceMatrix)` | no | Solve, return `DeviceMatrix` (async) |
| `info()` | lazy | Syncs stream on first call, returns `Success` or `NumericalIssue` |
```cpp
GpuLLT() // Default construct, then call compute()
GpuLLT(const EigenBase<D>& A) // Convenience: upload + factorize
### `GpuLU<Scalar>` API
GpuLLT& compute(const EigenBase<D>& A) // Upload + factorize
GpuLLT& compute(const DeviceMatrix& d_A) // D2D copy + factorize
GpuLLT& compute(DeviceMatrix&& d_A) // Adopt + factorize (no copy)
GPU dense partial-pivoting LU via cuSOLVER. Same pattern as `GpuLLT`, plus
`TransposeMode` parameter on `solve()` (`NoTranspose`, `Transpose`,
`ConjugateTranspose`).
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
DeviceMatrix solve(const DeviceMatrix& d_B) // -> DeviceMatrix (async, stays on device)
### `GpuQR<Scalar>` API
ComputationInfo info() // Lazy sync on first call: Success or NumericalIssue
Index rows() / cols()
cudaStream_t stream()
```
GPU dense QR decomposition via cuSOLVER (`geqrf`). Solve uses `ormqr` (apply
Q^H) + `trsm` (back-substitute on R) -- Q is never formed explicitly.
### `GpuLU<Scalar>` -- Dense LU (cuSOLVER)
| Method | Description |
|--------|-------------|
| `GpuQR()` | Default construct, then call `compute()` |
| `GpuQR(A)` | Construct and factorize from host matrix |
| `compute(A)` | Upload + factorize |
| `compute(DeviceMatrix)` | D2D copy + factorize |
| `solve(host_matrix)` | Solve, return host matrix (syncs) |
| `solve(DeviceMatrix)` | Solve, return `DeviceMatrix` (async) |
| `info()` | Lazy sync |
| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream |
Same pattern as `GpuLLT`. Adds `TransposeMode` parameter on `solve()`.
### `GpuSVD<Scalar>` API
```cpp
PlainMatrix solve(const MatrixBase<D>& B, TransposeMode m = NoTranspose) // -> host Matrix
DeviceMatrix solve(const DeviceMatrix& d_B, TransposeMode m = NoTranspose) // -> DeviceMatrix
```
GPU dense SVD via cuSOLVER (`gesvd`). Supports thin, full, and values-only
modes via Eigen's `ComputeThinU | ComputeThinV`, `ComputeFullU | ComputeFullV`,
or `0` (values only).
`TransposeMode`: `NoTranspose`, `Transpose`, `ConjugateTranspose`.
| Method | Description |
|--------|-------------|
| `GpuSVD()` | Default construct, then call `compute()` |
| `GpuSVD(A, options)` | Construct and compute (options default: `ComputeThinU \| ComputeThinV`) |
| `compute(A, options)` | Compute from host matrix |
| `compute(DeviceMatrix, options)` | Compute from device matrix |
| `singularValues()` | Download singular values to host |
| `matrixU()` | Download U to host (requires `ComputeThinU` or `ComputeFullU`) |
| `matrixVT()` | Download V^T to host (requires `ComputeThinV` or `ComputeFullV`) |
| `solve(B)` | Pseudoinverse solve (returns host matrix) |
| `solve(B, k)` | Truncated solve (top k singular triplets) |
| `solve(B, lambda)` | Tikhonov regularized solve |
| `rank(threshold)` | Count singular values above threshold |
| `info()` | Lazy sync |
| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream |
### `GpuQR<Scalar>` -- Dense QR (cuSOLVER)
Wide matrices (m < n) are handled by internally transposing via cuBLAS `geam`.
QR factorization via `cusolverDnXgeqrf`. Solve uses ORMQR (apply Q^H) + TRSM
(back-substitute on R) -- Q is never formed explicitly.
### `GpuSelfAdjointEigenSolver<Scalar>` API
```cpp
GpuQR() // Default construct
GpuQR(const EigenBase<D>& A) // Convenience: upload + factorize
GPU symmetric/Hermitian eigenvalue decomposition via cuSOLVER (`syevd`).
GpuQR& compute(const EigenBase<D>& A) // Upload + factorize
GpuQR& compute(const DeviceMatrix& d_A) // D2D copy + factorize
| Method | Description |
|--------|-------------|
| `GpuSelfAdjointEigenSolver()` | Default construct, then call `compute()` |
| `GpuSelfAdjointEigenSolver(A, mode)` | Construct and compute (mode default: `ComputeEigenvectors`) |
| `compute(A, mode)` | Compute from host matrix |
| `compute(DeviceMatrix, mode)` | Compute from device matrix |
| `eigenvalues()` | Download eigenvalues to host (ascending order) |
| `eigenvectors()` | Download eigenvectors to host (columns) |
| `info()` | Lazy sync |
| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream |
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
DeviceMatrix solve(const DeviceMatrix& d_B) // -> DeviceMatrix (async)
`ComputeMode`: `GpuSelfAdjointEigenSolver::EigenvaluesOnly` or
`GpuSelfAdjointEigenSolver::ComputeEigenvectors`.
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
### `HostTransfer<Scalar>` API
### `GpuSVD<Scalar>` -- Dense SVD (cuSOLVER)
Future for async device-to-host transfer.
SVD via `cusolverDnXgesvd`. Supports `ComputeThinU | ComputeThinV`,
`ComputeFullU | ComputeFullV`, or `0` (values only). Wide matrices (m < n)
handled by internal transpose.
| Method | Description |
|--------|-------------|
| `get()` | Block until transfer completes, return host matrix reference. Idempotent. |
| `ready()` | Non-blocking poll |
```cpp
GpuSVD() // Default construct, then call compute()
GpuSVD(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV) // Convenience
GpuSVD& compute(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV)
GpuSVD& compute(const DeviceMatrix& d_A, unsigned options = ComputeThinU | ComputeThinV)
RealVector singularValues() // -> host vector (syncs, downloads)
PlainMatrix matrixU() // -> host Matrix (syncs, downloads)
PlainMatrix matrixV() // -> host Matrix (V = VT^H, matches JacobiSVD)
PlainMatrix matrixVT() // -> host Matrix (syncs, downloads V^T)
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (pseudoinverse)
PlainMatrix solve(const MatrixBase<D>& B, Index k) // Truncated (top k triplets)
PlainMatrix solve(const MatrixBase<D>& B, RealScalar l) // Tikhonov regularized
Index rank(RealScalar threshold = -1)
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
**Note:** `singularValues()`, `matrixU()`, `matrixV()`, and `matrixVT()`
download to host on each call. Device-side accessors returning `DeviceMatrix`
are planned but not yet implemented.
### `GpuSelfAdjointEigenSolver<Scalar>` -- Eigendecomposition (cuSOLVER)
Symmetric/Hermitian eigenvalue decomposition via `cusolverDnXsyevd`.
`ComputeMode` enum: `EigenvaluesOnly`, `ComputeEigenvectors`.
```cpp
GpuSelfAdjointEigenSolver() // Default construct, then call compute()
GpuSelfAdjointEigenSolver(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors) // Convenience
GpuSelfAdjointEigenSolver& compute(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors)
GpuSelfAdjointEigenSolver& compute(const DeviceMatrix& d_A, ComputeMode mode = ComputeEigenvectors)
RealVector eigenvalues() // -> host vector (syncs, downloads, ascending order)
PlainMatrix eigenvectors() // -> host Matrix (syncs, downloads, columns)
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
**Note:** `eigenvalues()` and `eigenvectors()` download to host on each call.
Device-side accessors returning `DeviceMatrix` are planned but not yet
implemented.
### `HostTransfer<Scalar>`
Future for async device-to-host transfer. Returned by
`DeviceMatrix::toHostAsync()`.
```cpp
PlainMatrix& get() // Block until complete, return host Matrix ref. Idempotent.
bool ready() // Non-blocking poll
```
### `GpuSparseLLT<Scalar, UpLo>` -- Sparse Cholesky (cuDSS)
Requires cuDSS (CUDA 12.0+, `#define EIGEN_CUDSS`). Three-phase workflow
with symbolic reuse. Accepts `SparseMatrix<Scalar, ColMajor, int>` (CSC).
```cpp
GpuSparseLLT() // Default construct
GpuSparseLLT(const SparseMatrixBase<D>& A) // Analyze + factorize
GpuSparseLLT& analyzePattern(const SparseMatrixBase<D>& A) // Symbolic analysis (reusable)
GpuSparseLLT& factorize(const SparseMatrixBase<D>& A) // Numeric factorization
GpuSparseLLT& compute(const SparseMatrixBase<D>& A) // analyzePattern + factorize
void setOrdering(GpuSparseOrdering ord) // AMD (default), METIS, or RCM
DenseMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
ComputationInfo info() // Lazy sync
Index rows() / cols()
cudaStream_t stream()
```
### `GpuSparseLDLT<Scalar, UpLo>` -- Sparse LDL^T (cuDSS)
Symmetric indefinite. Same API as `GpuSparseLLT`.
### `GpuSparseLU<Scalar>` -- Sparse LU (cuDSS)
General non-symmetric. Same API as `GpuSparseLLT` (without `UpLo`).
### `GpuFFT<Scalar>` -- FFT (cuFFT)
Plans cached by (size, type) and reused. Inverse transforms scaled so
`inv(fwd(x)) == x`. Supported scalars: `float`, `double`.
```cpp
// 1D transforms (host vectors in and out)
ComplexVector fwd(const MatrixBase<D>& x) // C2C forward (complex input)
ComplexVector fwd(const MatrixBase<D>& x) // R2C forward (real input, returns n/2+1)
ComplexVector inv(const MatrixBase<D>& X) // C2C inverse, scaled by 1/n
RealVector invReal(const MatrixBase<D>& X, Index n) // C2R inverse, scaled by 1/n
// 2D transforms (host matrices in and out)
ComplexMatrix fwd2d(const MatrixBase<D>& A) // 2D C2C forward
ComplexMatrix inv2d(const MatrixBase<D>& A) // 2D C2C inverse, scaled by 1/(rows*cols)
cudaStream_t stream()
```
All FFT methods accept host data and return host data. Upload/download is
handled internally. The C2C and R2C overloads of `fwd()` are distinguished by
the input scalar type (complex vs real).
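For example, with `GpuFFT<float>` the overload is chosen purely by the input
scalar type (a sketch assuming n = 256):
```cpp
GpuFFT<float> fft;
VectorXf  r = VectorXf::Random(256);
VectorXcf c = VectorXcf::Random(256);
VectorXcf R = fft.fwd(r);           // real input    -> R2C, 129 (= n/2+1) coefficients
VectorXcf C = fft.fwd(c);           // complex input -> C2C, 256 coefficients
VectorXf  s = fft.invReal(R, 256);  // back to real; caller supplies n
```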
### `GpuSparseContext<Scalar>` -- SpMV/SpMM (cuSPARSE)
Accepts `SparseMatrix<Scalar, ColMajor>`.
```cpp
GpuSparseContext() // Creates own stream + cuSPARSE handle
GpuSparseContext(GpuContext& ctx) // Borrow GpuContext for same-stream execution
// Host data in/out
DenseVector multiply(A, x) // y = A * x
void multiply(A, x, y, alpha=1, beta=0, // y = alpha*op(A)*x + beta*y
op=CUSPARSE_OPERATION_NON_TRANSPOSE)
DenseVector multiplyT(A, x) // y = A^T * x
DenseMatrix multiplyMat(A, X) // Y = A * X (SpMM)
// DeviceMatrix in/out (sparse matrix re-uploaded each call)
void multiply(A, d_x, d_y) // SpMV with device vectors
void multiply(A, d_x, d_y, alpha, beta, op)
// Device-resident sparse matrix (upload once, reuse)
DeviceSparseView deviceView(A) // Upload sparse matrix, return view
cudaStream_t stream()
```
### `DeviceSparseView<Scalar>` -- Device-resident sparse matrix
Returned by `GpuSparseContext::deviceView()`. Holds a sparse matrix on device
for repeated SpMV without re-uploading.
```cpp
SpMVExpr operator*(const DeviceMatrix& d_x) // d_y = view * d_x (evaluated on assignment)
```
### Aliasing
@@ -373,7 +753,9 @@ Unlike Eigen's `Matrix`, where omitting `.noalias()` triggers a copy to a
temporary, DeviceMatrix dispatches directly to NVIDIA library calls which have
no built-in aliasing protection. All operations are implicitly noalias.
The caller must ensure operands don't alias the destination for GEMM and TRSM
(debug asserts catch violations).
(debug asserts catch violations). `geam` expressions (`d_C = d_A + alpha * d_B`)
are safe with aliasing. The `.noalias()` method exists as a no-op for Eigen
template compatibility.
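A short sketch of the practical consequence (names assumed):
```cpp
d_C = d_A + 2.0 * d_B;                 // geam: safe even if d_C aliases d_A or d_B
DeviceMatrix<double> d_T = d_A * d_B;  // GEMM: write to a distinct destination...
// d_A = d_A * d_B;                    // ...never to an operand (debug assert fires)
```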
## File layout
@@ -381,18 +763,29 @@ The caller must ensure operands don't alias the destination for GEMM and TRSM
|------|-----------|----------|
| `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `cuda_data_type<>` |
| `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` |
| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM expression wrappers |
| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM and geam expression wrappers |
| `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers |
| `DeviceSolverExpr.h` | `DeviceMatrix.h` | Solver expression wrappers (LLT, LU) |
| `DeviceScalar.h` | `GpuSupport.h`, `DeviceScalarOps.h` | `DeviceScalar<>` (device-resident scalar) |
| `DeviceScalarOps.h` | `<npps_*.h>` | Scalar div/neg/cwiseProduct via NPP |
| `DeviceDispatch.h` | all above | All dispatch functions + `DeviceAssignment` |
| `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `GpuContext` |
| `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>` | cuBLAS error macro, op/compute type maps |
| `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>`, `<cublasLt.h>` | cuBLAS/cuBLASLt error macro, type maps |
| `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
| `GpuQR.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense QR decomposition |
| `GpuSVD.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense SVD decomposition |
| `GpuEigenSolver.h` | `CuSolverSupport.h` | Self-adjoint eigenvalue decomposition |
| `CuFftSupport.h` | `GpuSupport.h`, `<cufft.h>` | cuFFT error macro, type-dispatch wrappers |
| `GpuFFT.h` | `CuFftSupport.h`, `CuBlasSupport.h` | 1D/2D FFT with plan caching |
| `CuSparseSupport.h` | `GpuSupport.h`, `<cusparse.h>` | cuSPARSE error macro |
| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE, `DeviceSparseView` |
| `CuDssSupport.h` | `GpuSupport.h`, `<cudss.h>` | cuDSS error macro, type traits (optional) |
| `GpuSparseSolverBase.h` | `CuDssSupport.h` | CRTP base for sparse solvers (optional) |
| `GpuSparseLLT.h` | `GpuSparseSolverBase.h` | Sparse Cholesky via cuDSS (optional) |
| `GpuSparseLDLT.h` | `GpuSparseSolverBase.h` | Sparse LDL^T via cuDSS (optional) |
| `GpuSparseLU.h` | `GpuSparseSolverBase.h` | Sparse LU via cuDSS (optional) |
## Building and testing
@@ -404,6 +797,41 @@ cmake -G Ninja -B build -S . \
-DEIGEN_TEST_CUSOLVER=ON
cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu \
gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen gpu_device_matrix
ctest --test-dir build -R "gpu_cublas|gpu_cusolver|gpu_device" --output-on-failure
gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen \
gpu_device_matrix gpu_cufft gpu_cusparse_spmv gpu_cg
ctest --test-dir build -R "gpu_" --output-on-failure
# Sparse solvers (cuDSS -- separate install required)
cmake -G Ninja -B build -S . \
-DEIGEN_TEST_CUDA=ON \
-DEIGEN_CUDA_COMPUTE_ARCH="70" \
-DEIGEN_TEST_CUDSS=ON
cmake --build build --target gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu
ctest --test-dir build -R gpu_cudss --output-on-failure
```
## Future work
- **Device-side accessors for decomposition results.** `GpuSVD`,
`GpuSelfAdjointEigenSolver`, and `GpuQR` currently download decomposition
results to host on access (e.g., `svd.matrixU()` returns a host `MatrixXd`).
Device-side accessors returning `DeviceMatrix` views of the internal buffers
would allow chaining GPU operations (e.g., `svd.deviceU() * d_A`) without
round-tripping through host memory.
- **Batched API (`DeviceBatchMatrix`).** A strided batch of N identical-size
matrices dispatching to cuBLAS/cuSOLVER batched APIs (`cublasDgemmBatched`,
`cusolverDnXpotrfBatched`, etc.). This enables robotics and model-predictive
control workloads where many small independent systems are solved in
parallel.
- **cuTENSOR for Tensor module.** Replace the hand-written GPU tensor
contraction and reduction kernels (~2300 lines in
`TensorContractionGpu.h` / `TensorReductionGpu.h`) with cuTENSOR dispatch,
following the same library-dispatch pattern used by `Eigen/GPU`.
- **Unified/zero-copy memory for Jetson.** Use `cudaMallocManaged` or
`cudaHostAllocMapped` to eliminate `fromHost()` / `toHost()` copies on
integrated GPUs (Jetson) where CPU and GPU share DRAM.
- **Device-side Eigen interop.** Bridge between host-side `DeviceMatrix`
dispatch and device-side Eigen expression templates (Core + Tensor) running
inside CUDA kernels. Raw-pointer + `Map` / `TensorMap` as the zero-copy
interop surface.

View File

@@ -31,7 +31,10 @@ EIGEN_DONT_INLINE void conjugate_gradient(const MatrixType& mat, const Rhs& rhs,
Index& iters, typename Dest::RealScalar& tol_error) {
typedef typename Dest::RealScalar RealScalar;
typedef typename Dest::Scalar Scalar;
typedef Matrix<Scalar, Dynamic, 1> VectorType;
// Use Dest's plain (owning) type as VectorType. For CPU Matrix/Map this
// resolves to Matrix<Scalar,Dynamic,1>. For GPU DeviceMatrix, PlainObject
// is DeviceMatrix itself (already owning).
typedef typename Dest::PlainObject VectorType;
RealScalar tol = tol_error;
Index maxIters = iters;

View File

@@ -11,7 +11,7 @@
# ncu --set full -o profile ./build-bench-gpu/bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096
cmake_minimum_required(VERSION 3.18)
project(EigenGpuBenchmarks CXX)
project(EigenGpuBenchmarks CXX CUDA)
find_package(benchmark REQUIRED)
find_package(CUDAToolkit REQUIRED)
@@ -51,3 +51,41 @@ eigen_add_gpu_benchmark(bench_gpu_chaining_float bench_gpu_chaining.cpp DEFINITI
# Batching benchmarks: multi-stream concurrency for many small systems.
eigen_add_gpu_benchmark(bench_gpu_batching bench_gpu_batching.cpp)
eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITIONS SCALAR=float)
# FFT benchmarks: 1D/2D C2C, R2C, C2R throughput and plan reuse.
eigen_add_gpu_benchmark(bench_gpu_fft bench_gpu_fft.cpp LIBRARIES CUDA::cufft)
eigen_add_gpu_benchmark(bench_gpu_fft_double bench_gpu_fft.cpp LIBRARIES CUDA::cufft DEFINITIONS SCALAR=double)
# CG sync overhead benchmark: host vs device pointer mode for reductions.
# Uses CUDA kernels for device scalar arithmetic.
add_executable(bench_gpu_cg_sync bench_gpu_cg_sync.cu)
target_include_directories(bench_gpu_cg_sync PRIVATE
${EIGEN_SOURCE_DIR}
${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(bench_gpu_cg_sync PRIVATE
benchmark::benchmark benchmark::benchmark_main
CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
target_compile_options(bench_gpu_cg_sync PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
target_compile_definitions(bench_gpu_cg_sync PRIVATE EIGEN_USE_GPU)
# GPU CG vs CPU CG comparison benchmark.
add_executable(bench_gpu_cg_vs_cpu bench_gpu_cg_vs_cpu.cu)
target_include_directories(bench_gpu_cg_vs_cpu PRIVATE
${EIGEN_SOURCE_DIR}
${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(bench_gpu_cg_vs_cpu PRIVATE
benchmark::benchmark benchmark::benchmark_main
CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
target_compile_options(bench_gpu_cg_vs_cpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
target_compile_definitions(bench_gpu_cg_vs_cpu PRIVATE EIGEN_USE_GPU)
# Bundle Adjustment benchmark: GPU CG vs CPU CG on real BAL datasets.
add_executable(bench_gpu_ba bench_gpu_ba.cu)
target_include_directories(bench_gpu_ba PRIVATE
${EIGEN_SOURCE_DIR}
${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(bench_gpu_ba PRIVATE
benchmark::benchmark
CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
target_compile_options(bench_gpu_ba PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
target_compile_definitions(bench_gpu_ba PRIVATE EIGEN_USE_GPU)

View File

@@ -0,0 +1,149 @@
# Bundle Adjustment: GPU CG vs CPU CG Results
Benchmark of Eigen's GPU CG pipeline on normal equations arising from bundle
adjustment (BAL datasets). Compares CPU `ConjugateGradient` (Jacobi preconditioner)
against GPU CG using `DeviceMatrix` + `GpuSparseContext` + `DeviceScalar`.
## Hardware
- **CPU**: Intel Core i7-13700HX (Raptor Lake, 12 cores / 24 threads, single thread for Eigen CG)
- **GPU**: NVIDIA GeForce RTX 4070 Laptop GPU (Ada Lovelace, 4608 CUDA cores, 8 GB GDDR6)
- **CUDA**: 13.2 / Driver 595.79
- **OS**: Ubuntu 24.04 (WSL2, kernel 6.6.87)
## Software
- Eigen: `eigen-gpu-cg` branch
- Google Benchmark 1.9.1
- Compiler: nvcc 13.2 + g++ 13.3
- Normal equations: H = J^T*J + I (Levenberg-Marquardt damping lambda=1.0)
- CG tolerance: 1e-8, max iterations: 10000
## Method
For each BAL problem file:
1. Parse the BAL file (cameras, 3D points, 2D observations)
2. Compute the full Jacobian J using the BAL camera model (Rodrigues rotation +
perspective projection + radial distortion) with central finite differences
3. Form the normal equations H = J^T*J + lambda*I (sparse, symmetric positive definite)
4. Solve H*dx = -J^T*r using CG with Jacobi preconditioner on CPU and GPU
5. Report wall-clock time (mean of 3 repetitions)
GPU CG uses: `GpuSparseContext` for SpMV, `DeviceMatrix` for vectors,
`DeviceScalar` with `CUBLAS_POINTER_MODE_DEVICE` for dot/norm reductions,
in-place `cwiseProduct` via NPP for Jacobi preconditioner application,
device-pointer-mode `scal` to avoid host sync on the beta update.
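The CPU baseline is Eigen's stock solver applied to the same `H` and `g`
(a minimal sketch; the exact driver is in `bench_gpu_ba.cu`):
```cpp
ConjugateGradient<SparseMatrix<double>, Lower | Upper, DiagonalPreconditioner<double>> cg;
cg.setTolerance(1e-8);       // matches the GPU run
cg.setMaxIterations(10000);
cg.compute(H);               // H = J^T*J + lambda*I
VectorXd dx = cg.solve(g);   // g = -J^T*r
```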
## Results
### Summary table
| Dataset | Cameras | Points | Obs | H size | H nnz | CG iters | CPU CG (ms) | GPU CG (ms) | Speedup |
|---------|---------|--------|-----|--------|-------|----------|-------------|-------------|---------|
| Ladybug-49 | 49 | 7,776 | 31,843 | 23,769 | 1.8M | 4,421 | 4,006 | 1,152 | **3.5x** |
| Ladybug-138 | 138 | 19,878 | 85,217 | 60,876 | 4.8M | 7,008 | 21,498 | 3,553 | **6.1x** |
| Ladybug-646 | 646 | 73,584 | 327,297 | 226,566 | 18.4M | 10,000* | 123,727 | 14,268 | **8.7x** |
| Dubrovnik-356 | 356 | 226,730 | 1,255,268 | 683,394 | 69.8M | 4,308 | 216,149 | 24,493 | **8.8x** |
\* Hit 10,000 iteration cap (poorly conditioned problem). Both CPU and GPU
hit the same cap, so timing comparison remains valid.
### Profile breakdown (Ladybug-138, nsys)
GPU kernel time is dominated by SpMV (91%). The remaining 9% is BLAS-1
operations (dot, axpy, scal) and NPP element-wise ops (cwiseProduct).
| Kernel | Time (ms) | % | Calls |
|--------|-----------|---|-------|
| cuSPARSE csrmv (SpMV) | 2507 | 91.3% | 7,006 |
| cuBLAS dot | 92 | 3.4% | 21,020 |
| cuBLAS axpy (device ptr) | 27 | 1.0% | 14,012 |
| cuSPARSE partition | 19 | 0.7% | 7,006 |
| NPP cwiseProduct | 16 + 13 | 1.1% | 14,011 + 7,006 |
| cuBLAS axpy (host ptr) | 12 | 0.5% | 7,005 |
| cuBLAS scal (device ptr) | 11 | 0.4% | 7,005 |
| NPP scalar ops | 7 | 0.2% | 7,006 |
### Optimizations applied
Three profiling-driven optimizations reduced GPU CG time by **1.8x**
(6.5s → 3.6s on Ladybug-138):
1. **In-place `cwiseProduct`**: The Jacobi preconditioner apply
(`z = invdiag .* residual`) was allocating a new DeviceMatrix every
iteration. Added `z.cwiseProduct(ctx, a, b)` that reuses `z`'s buffer.
Reduced `cudaMalloc` calls from 7,053 to 23 (saving 2.3s).
2. **`squaredNorm` via `dot(x,x)`**: cuBLAS `nrm2` uses a numerically
careful scaled-sum-of-squares algorithm (29µs/call). Replaced with
`dot(x,x)` (6.4µs/call) — 4.5x faster per call, saving ~320ms.
3. **Device-pointer `scal`**: `p *= beta` was converting `DeviceScalar`
beta to host (triggering a stream sync), then calling host-pointer-mode
scal. Added `operator*=(DeviceScalar)` that uses device-pointer-mode
scal, eliminating one sync per iteration. Halved `cudaStreamSynchronize`
calls from 14K to 7K. Optimizations 1 and 3 are sketched below.
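A rough sketch of what optimizations 1 and 3 look like at the call site,
using the `cwiseProduct` and `operator*=(DeviceScalar)` overloads documented
in the GPU README (variable names assumed):
```cpp
// Optimization 1: Jacobi preconditioner apply without allocating a new vector.
d_z.cwiseProduct(ctx, d_invdiag, d_r);  // in-place: z = invdiag .* r (reuses d_z's buffer)

// Optimization 3: beta update without pulling the DeviceScalar back to host.
auto d_beta = d_absNew / d_absOld;      // DeviceScalar / DeviceScalar (NPP, no sync)
d_p *= d_beta;                          // device-pointer-mode scal
```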
### Observations
1. **GPU speedup scales with problem size**: from 3.5x on small problems
(24K variables) to 8.8x on large problems (683K variables). This is
expected — larger problems have more parallelism for the GPU to exploit.
2. **Iteration counts match**: CPU and GPU CG converge in the same number
of iterations (within 1%), confirming numerical equivalence.
3. **Bottleneck is SpMV**: CG iteration time is dominated (91%) by the
sparse matrix-vector product on H. Further speedup requires either
faster SpMV (e.g., block-sparse formats) or algorithmic improvements
(Schur complement, better preconditioners).
4. **Remaining overhead**: CUDA API calls (cudaMemcpyAsync for 8-byte
DeviceScalar transfers) account for ~50% of non-kernel time. Batching
multiple scalar reductions into a single transfer would help.
5. **Jacobi preconditioner is weak for BA**: The Ladybug-646 problem does
not converge in 10K iterations. Ceres uses block Jacobi or Schur
complement preconditioners that would also benefit from GPU acceleration.
### Scaling plot data
```
# n nnz_H cpu_ms gpu_ms speedup
23769 1793475 4006 1152 3.48
60876 4791762 21498 3553 6.05
226566 18387948 123727 14268 8.67
683394 69827066 216149 24493 8.82
```
## BAL datasets
Downloaded from http://grail.cs.washington.edu/projects/bal/
| File | Source |
|------|--------|
| problem-49-7776-pre.txt | Ladybug sequence |
| problem-138-19878-pre.txt | Ladybug sequence |
| problem-646-73584-pre.txt | Ladybug sequence |
| problem-356-226730-pre.txt | Dubrovnik reconstruction |
## Reproducing
```bash
# Build
cmake -G Ninja -B build-bench-gpu -S benchmarks/GPU -DCMAKE_CUDA_ARCHITECTURES=89
cmake --build build-bench-gpu --target bench_gpu_ba
# Download BAL datasets
wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-49-7776-pre.txt.bz2
wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-138-19878-pre.txt.bz2
wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-646-73584-pre.txt.bz2
wget http://grail.cs.washington.edu/projects/bal/data/dubrovnik/problem-356-226730-pre.txt.bz2
bunzip2 *.bz2
# Run (one at a time)
BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
BAL_FILE=problem-138-19878-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
BAL_FILE=problem-646-73584-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
BAL_FILE=problem-356-226730-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
```

View File

@@ -0,0 +1,533 @@
// Bundle Adjustment benchmark: GPU CG vs CPU CG on real BAL datasets.
//
// Tests Eigen's GPU CG pipeline (DeviceMatrix + GpuSparseContext + DeviceScalar)
// on the normal equations (J^T*J) arising from bundle adjustment problems.
//
// Reads a BAL (Bundle Adjustment in the Large) format file, computes the
// Jacobian and residual, forms the normal equations H = J^T*J + lambda*I,
// then solves H*dx = -J^T*r with both CPU and GPU conjugate gradients.
//
// BAL format: http://grail.cs.washington.edu/projects/bal/
//
// Usage:
// cmake --build build-bench-gpu --target bench_gpu_ba
//
// # Download a BAL dataset (bz2-compressed):
// wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-49-7776-pre.txt.bz2
// bunzip2 problem-49-7776-pre.txt.bz2
//
// # Run on a specific problem:
// BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba
//
// # Append results to the log:
// BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba \
// --benchmark_format=console 2>&1 | tee -a benchmarks/GPU/ba_results.log
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/GPU>
#include <algorithm>  // std::copy
#include <cmath>
#include <cstdio>
#include <cstdlib>    // std::getenv
#include <fstream>
#include <string>
#include <vector>
using namespace Eigen;
// ============================================================================
// BAL problem data
// ============================================================================
struct BALProblem {
int num_cameras = 0;
int num_points = 0;
int num_observations = 0;
// Observations: (camera_idx, point_idx, observed_x, observed_y).
std::vector<int> camera_index;
std::vector<int> point_index;
std::vector<double> observations_x;
std::vector<double> observations_y;
// Camera parameters: 9 per camera (Rodrigues r[3], translation t[3], f, k1, k2).
std::vector<double> cameras; // [num_cameras * 9]
// 3D points: 3 per point.
std::vector<double> points; // [num_points * 3]
const double* camera(int i) const { return &cameras[i * 9]; }
const double* point(int i) const { return &points[i * 3]; }
bool load(const std::string& filename) {
std::ifstream in(filename);
if (!in) {
fprintf(stderr, "ERROR: Cannot open BAL file: %s\n", filename.c_str());
return false;
}
in >> num_cameras >> num_points >> num_observations;
if (!in || num_cameras <= 0 || num_points <= 0 || num_observations <= 0) {
fprintf(stderr, "ERROR: Invalid BAL header in %s\n", filename.c_str());
return false;
}
camera_index.resize(num_observations);
point_index.resize(num_observations);
observations_x.resize(num_observations);
observations_y.resize(num_observations);
for (int i = 0; i < num_observations; ++i) {
in >> camera_index[i] >> point_index[i] >> observations_x[i] >> observations_y[i];
}
cameras.resize(num_cameras * 9);
for (int i = 0; i < num_cameras * 9; ++i) {
in >> cameras[i];
}
points.resize(num_points * 3);
for (int i = 0; i < num_points * 3; ++i) {
in >> points[i];
}
if (!in) {
fprintf(stderr, "ERROR: Truncated BAL file: %s\n", filename.c_str());
return false;
}
fprintf(stderr, "Loaded BAL: %d cameras, %d points, %d observations\n", num_cameras, num_points, num_observations);
return true;
}
};
// ============================================================================
// Camera projection model (BAL convention)
// ============================================================================
// Rodrigues rotation: rotate point X by axis-angle vector omega.
static void rodrigues_rotate(const double* omega, const double* X, double* result) {
double theta2 = omega[0] * omega[0] + omega[1] * omega[1] + omega[2] * omega[2];
if (theta2 > 1e-30) {
double theta = std::sqrt(theta2);
double costh = std::cos(theta);
double sinth = std::sin(theta);
double k = (1.0 - costh) / theta2;
// Cross product omega x X.
double wx = omega[1] * X[2] - omega[2] * X[1];
double wy = omega[2] * X[0] - omega[0] * X[2];
double wz = omega[0] * X[1] - omega[1] * X[0];
// Dot product omega . X.
double dot = omega[0] * X[0] + omega[1] * X[1] + omega[2] * X[2];
result[0] = X[0] * costh + wx * (sinth / theta) + omega[0] * dot * k;
result[1] = X[1] * costh + wy * (sinth / theta) + omega[1] * dot * k;
result[2] = X[2] * costh + wz * (sinth / theta) + omega[2] * dot * k;
} else {
// Small angle: R ≈ I + [omega]×.
result[0] = X[0] + omega[1] * X[2] - omega[2] * X[1];
result[1] = X[1] + omega[2] * X[0] - omega[0] * X[2];
result[2] = X[2] + omega[0] * X[1] - omega[1] * X[0];
}
}
// Project a 3D point through a camera, returning the 2D residual.
// camera: [r0,r1,r2, t0,t1,t2, f, k1, k2]
// point: [X, Y, Z]
// observed: [ox, oy]
// residual: [rx, ry] = projected - observed
static void project(const double* camera, const double* point, const double* observed, double* residual) {
// Rotate.
double P[3];
rodrigues_rotate(camera, point, P);
// Translate.
P[0] += camera[3];
P[1] += camera[4];
P[2] += camera[5];
// Normalize (BAL convention: negative z).
double xp = -P[0] / P[2];
double yp = -P[1] / P[2];
// Radial distortion.
double r2 = xp * xp + yp * yp;
double distortion = 1.0 + camera[7] * r2 + camera[8] * r2 * r2;
// Apply focal length.
double predicted_x = camera[6] * distortion * xp;
double predicted_y = camera[6] * distortion * yp;
residual[0] = predicted_x - observed[0];
residual[1] = predicted_y - observed[1];
}
// ============================================================================
// Jacobian computation (numerical differentiation)
// ============================================================================
// Compute the 2x9 Jacobian block w.r.t. camera params and 2x3 block w.r.t.
// point coords for a single observation, using central finite differences.
static void compute_jacobian_block(const double* camera, const double* point, const double* observed,
double* J_cam, // 2x9, row-major
double* J_point) // 2x3, row-major
{
constexpr double eps = 1e-8;
// Camera parameters (9).
double cam_pert[9];
std::copy(camera, camera + 9, cam_pert);
for (int j = 0; j < 9; ++j) {
double orig = cam_pert[j];
double rp[2], rm[2];
cam_pert[j] = orig + eps;
project(cam_pert, point, observed, rp);
cam_pert[j] = orig - eps;
project(cam_pert, point, observed, rm);
cam_pert[j] = orig;
J_cam[0 * 9 + j] = (rp[0] - rm[0]) / (2.0 * eps);
J_cam[1 * 9 + j] = (rp[1] - rm[1]) / (2.0 * eps);
}
// Point coordinates (3).
double pt_pert[3];
std::copy(point, point + 3, pt_pert);
for (int j = 0; j < 3; ++j) {
double orig = pt_pert[j];
double rp[2], rm[2];
pt_pert[j] = orig + eps;
project(camera, pt_pert, observed, rp);
pt_pert[j] = orig - eps;
project(camera, pt_pert, observed, rm);
pt_pert[j] = orig;
J_point[0 * 3 + j] = (rp[0] - rm[0]) / (2.0 * eps);
J_point[1 * 3 + j] = (rp[1] - rm[1]) / (2.0 * eps);
}
}
// ============================================================================
// Build normal equations: H = J^T*J + lambda*I, g = -J^T*r
// ============================================================================
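// Solving H * dx = g with H = J^T J + lambda I is the damped Gauss-Newton
// (Levenberg-Marquardt) step; the benchmarks below time only this linear solve,
// not the Jacobian assembly done here.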
struct NormalEquations {
SparseMatrix<double, ColMajor, int> H;
VectorXd g;
VectorXd residual;
double residual_norm;
int jacobian_rows;
int jacobian_cols;
long jacobian_nnz;
};
static NormalEquations build_normal_equations(const BALProblem& problem, double lambda = 1.0) {
const int num_cam_params = problem.num_cameras * 9;
const int num_pt_params = problem.num_points * 3;
const int num_params = num_cam_params + num_pt_params;
const int num_residuals = problem.num_observations * 2;
fprintf(stderr, "Building Jacobian: %d x %d, %ld nonzeros\n", num_residuals, num_params,
(long)problem.num_observations * 24);
// Build J as a triplet list.
using Triplet = Eigen::Triplet<double>;
std::vector<Triplet> triplets;
triplets.reserve(problem.num_observations * 24); // 2 rows × 12 nonzeros = 24 entries per obs
VectorXd residual(num_residuals);
for (int obs = 0; obs < problem.num_observations; ++obs) {
int ci = problem.camera_index[obs];
int pi = problem.point_index[obs];
double observed[2] = {problem.observations_x[obs], problem.observations_y[obs]};
// Compute residual.
double r[2];
project(problem.camera(ci), problem.point(pi), observed, r);
residual[obs * 2 + 0] = r[0];
residual[obs * 2 + 1] = r[1];
// Compute Jacobian blocks.
double J_cam[18], J_pt[6]; // 2x9 and 2x3
compute_jacobian_block(problem.camera(ci), problem.point(pi), observed, J_cam, J_pt);
// Insert camera block: rows [2*obs, 2*obs+1], cols [9*ci, 9*ci+8].
for (int row = 0; row < 2; ++row) {
for (int col = 0; col < 9; ++col) {
double val = J_cam[row * 9 + col];
if (val != 0.0) {
triplets.emplace_back(obs * 2 + row, ci * 9 + col, val);
}
}
}
// Insert point block: rows [2*obs, 2*obs+1], cols [num_cam_params + 3*pi, ...].
for (int row = 0; row < 2; ++row) {
for (int col = 0; col < 3; ++col) {
double val = J_pt[row * 3 + col];
if (val != 0.0) {
triplets.emplace_back(obs * 2 + row, num_cam_params + pi * 3 + col, val);
}
}
}
}
// Build sparse Jacobian.
SparseMatrix<double, ColMajor, int> J(num_residuals, num_params);
J.setFromTriplets(triplets.begin(), triplets.end());
fprintf(stderr, "Jacobian: %dx%d, nnz=%ld\n", (int)J.rows(), (int)J.cols(), (long)J.nonZeros());
// Form normal equations: H = J^T*J + lambda*I.
SparseMatrix<double, ColMajor, int> H = (J.transpose() * J).pruned();
// Add Levenberg-Marquardt damping.
for (int i = 0; i < num_params; ++i) {
H.coeffRef(i, i) += lambda;
}
H.makeCompressed();
// Gradient: g = -J^T * r.
VectorXd g = -(J.transpose() * residual);
double rnorm = residual.norm();
fprintf(stderr, "Normal equations: H is %dx%d, nnz=%ld, |r|=%.6e\n", (int)H.rows(), (int)H.cols(), (long)H.nonZeros(),
rnorm);
return {std::move(H), std::move(g), std::move(residual), rnorm, num_residuals, num_params, (long)J.nonZeros()};
}
// ============================================================================
// Global problem state (loaded once before benchmarks run)
// ============================================================================
static BALProblem g_problem;
static NormalEquations g_neq;
static bool g_loaded = false;
static void ensure_loaded() {
if (g_loaded) return;
const char* bal_file = std::getenv("BAL_FILE");
if (!bal_file) {
fprintf(stderr,
"ERROR: Set BAL_FILE environment variable to a BAL problem file.\n"
" Download from: http://grail.cs.washington.edu/projects/bal/\n"
" Example:\n"
" wget http://grail.cs.washington.edu/projects/bal/data/ladybug/"
"problem-49-7776-pre.txt.bz2\n"
" bunzip2 problem-49-7776-pre.txt.bz2\n"
" BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba\n");
std::exit(1);
}
if (!g_problem.load(bal_file)) {
std::exit(1);
}
g_neq = build_normal_equations(g_problem);
g_loaded = true;
}
// ============================================================================
// CPU CG benchmark
// ============================================================================
static void BM_BA_CPU_CG(benchmark::State& state) {
ensure_loaded();
const auto& H = g_neq.H;
const auto& g = g_neq.g;
ConjugateGradient<SparseMatrix<double, ColMajor, int>, Lower | Upper> cg;
cg.setMaxIterations(10000);
cg.setTolerance(1e-8);
cg.compute(H);
int last_iters = 0;
double last_error = 0;
for (auto _ : state) {
VectorXd dx = cg.solve(g);
benchmark::DoNotOptimize(dx.data());
last_iters = cg.iterations();
last_error = cg.error();
}
state.counters["n"] = H.rows();
state.counters["nnz"] = H.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
state.counters["cameras"] = g_problem.num_cameras;
state.counters["points"] = g_problem.num_points;
state.counters["observations"] = g_problem.num_observations;
}
// ============================================================================
// GPU CG benchmark (with Jacobi preconditioner)
// ============================================================================
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
static void BM_BA_GPU_CG(benchmark::State& state) {
ensure_loaded();
cuda_warmup();
const auto& H = g_neq.H;
const auto& g = g_neq.g;
const Index n = H.rows();
// Extract inverse diagonal (Jacobi preconditioner).
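// The preconditioner applies z = M^{-1} r with M = diag(H); entries with a zero
// or missing diagonal fall back to 1 (identity).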
using SpMat = SparseMatrix<double, ColMajor, int>;
VectorXd invdiag(n);
for (Index j = 0; j < H.outerSize(); ++j) {
SpMat::InnerIterator it(H, j);
while (it && it.index() != j) ++it;
if (it && it.index() == j && it.value() != 0.0)
invdiag(j) = 1.0 / it.value();
else
invdiag(j) = 1.0;
}
// Set up GPU context and upload data.
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<double> spmv_ctx(ctx);
auto mat = spmv_ctx.deviceView(H);
auto d_invdiag = DeviceMatrix<double>::fromHost(invdiag, ctx.stream());
auto d_g = DeviceMatrix<double>::fromHost(g, ctx.stream());
int last_iters = 0;
double last_error = 0;
for (auto _ : state) {
DeviceMatrix<double> d_x(n, 1);
d_x.setZero(ctx);
DeviceMatrix<double> residual(n, 1);
residual.copyFrom(ctx, d_g);
double rhsNorm2 = d_g.squaredNorm(ctx);
double threshold = 1e-8 * 1e-8 * rhsNorm2;
double residualNorm2 = residual.squaredNorm(ctx);
DeviceMatrix<double> p = d_invdiag.cwiseProduct(ctx, residual);
DeviceMatrix<double> z(n, 1), tmp(n, 1);
auto absNew = residual.dot(ctx, p);
Index i = 0;
Index maxIters = 10000;
while (i < maxIters) {
tmp.noalias() = mat * p;
auto alpha = absNew / p.dot(ctx, tmp);
d_x += alpha * p;
residual -= alpha * tmp;
residualNorm2 = residual.squaredNorm(ctx);
if (residualNorm2 < threshold) break;
z.cwiseProduct(ctx, d_invdiag, residual); // in-place, no allocation
auto absOld = std::move(absNew);
absNew = residual.dot(ctx, z);
auto beta = absNew / absOld;
p *= beta; // device-pointer scal, no host sync
p += z;
i++;
}
benchmark::DoNotOptimize(d_x.data());
last_iters = i;
last_error = std::sqrt(residualNorm2 / rhsNorm2);
}
GpuContext::setThreadLocal(nullptr);
state.counters["n"] = n;
state.counters["nnz"] = H.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
state.counters["cameras"] = g_problem.num_cameras;
state.counters["points"] = g_problem.num_points;
state.counters["observations"] = g_problem.num_observations;
}
// ============================================================================
// CPU CG with Jacobi preconditioner (apples-to-apples comparison)
// ============================================================================
static void BM_BA_CPU_CG_Jacobi(benchmark::State& state) {
ensure_loaded();
const auto& H = g_neq.H;
const auto& g = g_neq.g;
// Eigen's ConjugateGradient defaults to DiagonalPreconditioner, i.e. a Jacobi
// preconditioner, matching the GPU benchmark above.
ConjugateGradient<SparseMatrix<double, ColMajor, int>, Lower | Upper> cg;
cg.setMaxIterations(10000);
cg.setTolerance(1e-8);
cg.compute(H);
int last_iters = 0;
double last_error = 0;
for (auto _ : state) {
VectorXd dx = cg.solve(g);
benchmark::DoNotOptimize(dx.data());
last_iters = cg.iterations();
last_error = cg.error();
}
state.counters["n"] = H.rows();
state.counters["nnz"] = H.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
}
// ============================================================================
// Register benchmarks
// ============================================================================
BENCHMARK(BM_BA_CPU_CG)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_BA_CPU_CG_Jacobi)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_BA_GPU_CG)->Unit(benchmark::kMillisecond);
// ============================================================================
// Custom main: print summary after benchmarks
// ============================================================================
int main(int argc, char** argv) {
benchmark::Initialize(&argc, argv);
// Print problem info before benchmarks.
const char* bal_file = std::getenv("BAL_FILE");
if (bal_file) {
ensure_loaded();
fprintf(stderr,
"\n"
"=== Bundle Adjustment GPU CG Benchmark ===\n"
"BAL file: %s\n"
"Cameras: %d\n"
"Points: %d\n"
"Observations: %d\n"
"J size: %d x %d, nnz=%ld\n"
"H size: %d x %d, nnz=%ld\n"
"|residual|: %.6e\n"
"==========================================\n\n",
bal_file, g_problem.num_cameras, g_problem.num_points, g_problem.num_observations, g_neq.jacobian_rows,
g_neq.jacobian_cols, g_neq.jacobian_nnz, (int)g_neq.H.rows(), (int)g_neq.H.cols(), (long)g_neq.H.nonZeros(),
g_neq.residual_norm);
}
benchmark::RunSpecifiedBenchmarks();
benchmark::Shutdown();
return 0;
}

View File

@@ -0,0 +1,291 @@
// Benchmark: GPU Conjugate Gradient via DeviceMatrix operators.
//
// Shows the path to running Eigen's CG on GPU with minimal code changes.
// The DeviceMatrix benchmark mirrors Eigen's conjugate_gradient() line-by-line.
// A raw cuBLAS device-pointer-mode implementation is included as a lower bound.
//
// The only change needed in Eigen's CG template to support DeviceMatrix:
// Line 34: typedef Dest VectorType; (instead of Matrix<Scalar, Dynamic, 1>)
//
// Usage:
// cmake --build build-bench-gpu --target bench_gpu_cg_sync
// ./build-bench-gpu/bench_gpu_cg_sync
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/GPU>
#include <cusparse.h>
using namespace Eigen;
using Scalar = double;
using RealScalar = double;
using Vec = Matrix<Scalar, Dynamic, 1>;
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
static SpMat make_spd(Index n) {
SpMat A(n, n);
A.reserve(VectorXi::Constant(n, 3));
for (Index i = 0; i < n; ++i) {
A.insert(i, i) = 4.0;
if (i > 0) A.insert(i, i - 1) = -1.0;
if (i < n - 1) A.insert(i, i + 1) = -1.0;
}
A.makeCompressed();
return A;
}
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
// ==========================================================================
// GPU CG using DeviceMatrix operators — mirrors Eigen's conjugate_gradient()
// ==========================================================================
//
// Compare with Eigen/src/IterativeLinearSolvers/ConjugateGradient.h lines 29-84.
// Left column: Eigen CG code. Right column: this benchmark.
//
//   Eigen CG                                  GPU CG (this benchmark)
//   --------                                  -----------------------
//   VectorType residual = rhs - mat * x;      residual.copyFrom(ctx, rhs);                [x=0 so r=b]
//   RealScalar rhsNorm2 = rhs.sqNorm();       RealScalar rhsNorm2 = rhs.squaredNorm();
//   ...
//   tmp.noalias() = mat * p;                  tmp.noalias() = mat * p;                    [identical]
//   Scalar alpha = absNew / p.dot(tmp);       Scalar alpha = absNew / p.dot(tmp);         [identical]
//   x += alpha * p;                           x += alpha * p;                             [identical]
//   residual -= alpha * tmp;                  residual -= alpha * tmp;                    [identical]
//   residualNorm2 = residual.sqNorm();        residualNorm2 = residual.squaredNorm();     [identical]
//   ...
//   p = z + beta * p;                         p *= beta; p += z;                          [equivalent, no alloc]
static void BM_CG_DeviceMatrixOps(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
SpMat A = make_spd(n);
Vec b = Vec::Random(n);
// One shared context: SpMV + BLAS-1 on same stream, zero event overhead.
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<Scalar> spmv(ctx);
auto mat = spmv.deviceView(A);
// Upload RHS once.
auto rhs = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
for (auto _ : state) {
// --- Eigen CG lines 34-63: initialization ---
// typedef Dest VectorType; // GPU CHANGE: was Matrix<Scalar,Dynamic,1>
// VectorType residual = rhs - mat * x; // x=0, so residual = rhs
DeviceMatrix<Scalar> x(n, 1);
x.setZero();
DeviceMatrix<Scalar> residual(n, 1);
residual.copyFrom(ctx, rhs);
// RealScalar rhsNorm2 = rhs.squaredNorm();
RealScalar rhsNorm2 = rhs.squaredNorm();
if (rhsNorm2 == 0) continue;
RealScalar tol = 1e-10;
const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
RealScalar threshold = numext::maxi(RealScalar(tol * tol * rhsNorm2), considerAsZero);
// RealScalar residualNorm2 = residual.squaredNorm();
RealScalar residualNorm2 = residual.squaredNorm();
if (residualNorm2 < threshold) continue;
// VectorType p(n);
// p = precond.solve(residual); // no preconditioner: p = residual
DeviceMatrix<Scalar> p(n, 1);
p.copyFrom(ctx, residual);
// VectorType z(n), tmp(n);
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
// auto absNew = numext::real(residual.dot(p));
// DeviceScalar — stays on device, no sync.
auto absNew = residual.dot(p); // DeviceScalar, no sync
// while (i < maxIters) {
Index maxIters = 200;
Index i = 0;
while (i < maxIters) {
// tmp.noalias() = mat * p;
tmp.noalias() = mat * p; // SpMV, device-resident
// auto alpha = absNew / p.dot(tmp);
// DeviceScalar / DeviceScalar → device kernel, no sync!
auto alpha = absNew / p.dot(tmp); // DeviceScalar, no sync
// x += alpha * p;
// DeviceScalar * DeviceMatrix → device-pointer axpy, no sync!
x += alpha * p;
// residual -= alpha * tmp;
residual -= alpha * tmp; // device-pointer axpy, no sync
// residualNorm2 = residual.squaredNorm();
residualNorm2 = residual.squaredNorm(); // THE one sync per iteration
// if (residualNorm2 < threshold) break;
if (residualNorm2 < threshold) break;
// z = precond.solve(residual);
z.copyFrom(ctx, residual); // no preconditioner
// auto absOld = std::move(absNew);
auto absOld = std::move(absNew); // no sync, no alloc
// absNew = numext::real(residual.dot(z));
absNew = residual.dot(z); // DeviceScalar, no sync
// auto beta = absNew / absOld;
// DeviceScalar / DeviceScalar → device kernel, no sync!
auto beta = absNew / absOld; // DeviceScalar, no sync
// p = z + beta * p;
p *= beta; // device-pointer scal, no host sync
p += z;
i++;
}
}
GpuContext::setThreadLocal(nullptr);
state.SetItemsProcessed(state.iterations() * 200);
}
BENCHMARK(BM_CG_DeviceMatrixOps)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
// ==========================================================================
// Raw cuBLAS device-pointer-mode CG (1 sync/iter) — performance lower bound
// ==========================================================================
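// In CUBLAS_POINTER_MODE_DEVICE the scalar arguments of dot/axpy/scal live in
// device memory, so alpha and beta produced by the tiny kernels below feed the
// next BLAS call directly; the only host sync per iteration is the copy of the
// residual norm used for the convergence test.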
__global__ void scalar_div_kernel(const Scalar* a, const Scalar* b, Scalar* out) { *out = *a / *b; }
__global__ void scalar_neg_kernel(const Scalar* in, Scalar* out) { *out = -(*in); }
static void BM_CG_DevicePointerMode(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const int maxIters = 200;
SpMat A = make_spd(n);
Vec b = Vec::Random(n);
cudaStream_t stream;
cudaStreamCreate(&stream);
cublasHandle_t cublas;
cublasCreate(&cublas);
cublasSetStream(cublas, stream);
cusparseHandle_t cusparse;
cusparseCreate(&cusparse);
cusparseSetStream(cusparse, stream);
internal::DeviceBuffer d_outer((n + 1) * sizeof(int));
internal::DeviceBuffer d_inner(A.nonZeros() * sizeof(int));
internal::DeviceBuffer d_vals(A.nonZeros() * sizeof(Scalar));
cudaMemcpy(d_outer.ptr, A.outerIndexPtr(), (n + 1) * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_inner.ptr, A.innerIndexPtr(), A.nonZeros() * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_vals.ptr, A.valuePtr(), A.nonZeros() * sizeof(Scalar), cudaMemcpyHostToDevice);
cusparseSpMatDescr_t matA;
cusparseCreateCsc(&matA, n, n, A.nonZeros(), d_outer.ptr, d_inner.ptr, d_vals.ptr, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
internal::DeviceBuffer d_tmp_buf(n * sizeof(Scalar));
cusparseDnVecDescr_t tmp_x, tmp_y;
cusparseCreateDnVec(&tmp_x, n, d_tmp_buf.ptr, CUDA_R_64F);
cusparseCreateDnVec(&tmp_y, n, d_tmp_buf.ptr, CUDA_R_64F);
Scalar spmv_alpha = 1.0, spmv_beta = 0.0;
size_t ws_size = 0;
cusparseSpMV_bufferSize(cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &spmv_alpha, matA, tmp_x, &spmv_beta, tmp_y,
CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &ws_size);
internal::DeviceBuffer d_workspace(ws_size);
cusparseDestroyDnVec(tmp_x);
cusparseDestroyDnVec(tmp_y);
internal::DeviceBuffer d_x(n * sizeof(Scalar)), d_r(n * sizeof(Scalar));
internal::DeviceBuffer d_p(n * sizeof(Scalar)), d_tmp(n * sizeof(Scalar));
internal::DeviceBuffer d_b(n * sizeof(Scalar));
internal::DeviceBuffer d_absNew(sizeof(Scalar)), d_absOld(sizeof(Scalar));
internal::DeviceBuffer d_pdot(sizeof(Scalar)), d_alpha(sizeof(Scalar));
internal::DeviceBuffer d_neg_alpha(sizeof(Scalar)), d_beta(sizeof(Scalar));
internal::DeviceBuffer d_rnorm(sizeof(RealScalar));
cudaMemcpy(d_b.ptr, b.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice);
auto spmv = [&](Scalar* x_ptr, Scalar* y_ptr) {
cusparseDnVecDescr_t vx, vy;
cusparseCreateDnVec(&vx, n, x_ptr, CUDA_R_64F);
cusparseCreateDnVec(&vy, n, y_ptr, CUDA_R_64F);
cusparseSpMV(cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &spmv_alpha, matA, vx, &spmv_beta, vy, CUDA_R_64F,
CUSPARSE_SPMV_ALG_DEFAULT, d_workspace.ptr);
cusparseDestroyDnVec(vx);
cusparseDestroyDnVec(vy);
};
for (auto _ : state) {
cudaMemsetAsync(static_cast<Scalar*>(d_x.ptr), 0, n * sizeof(Scalar), stream);
cudaMemcpyAsync(d_r.ptr, d_b.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
cudaMemcpyAsync(d_p.ptr, d_b.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_DEVICE);
cublasDdot(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_p.ptr), 1,
static_cast<Scalar*>(d_absNew.ptr));
for (int i = 0; i < maxIters; ++i) {
spmv(static_cast<Scalar*>(d_p.ptr), static_cast<Scalar*>(d_tmp.ptr));
cublasDdot(cublas, n, static_cast<Scalar*>(d_p.ptr), 1, static_cast<Scalar*>(d_tmp.ptr), 1,
static_cast<Scalar*>(d_pdot.ptr));
scalar_div_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_absNew.ptr), static_cast<Scalar*>(d_pdot.ptr),
static_cast<Scalar*>(d_alpha.ptr));
scalar_neg_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_alpha.ptr), static_cast<Scalar*>(d_neg_alpha.ptr));
cublasDaxpy(cublas, n, static_cast<Scalar*>(d_alpha.ptr), static_cast<Scalar*>(d_p.ptr), 1,
static_cast<Scalar*>(d_x.ptr), 1);
cublasDaxpy(cublas, n, static_cast<Scalar*>(d_neg_alpha.ptr), static_cast<Scalar*>(d_tmp.ptr), 1,
static_cast<Scalar*>(d_r.ptr), 1);
cublasDnrm2(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<RealScalar*>(d_rnorm.ptr));
RealScalar rnorm;
cudaMemcpyAsync(&rnorm, d_rnorm.ptr, sizeof(RealScalar), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
if (rnorm * rnorm < 1e-20) break;
cudaMemcpyAsync(d_absOld.ptr, d_absNew.ptr, sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
cublasDdot(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_r.ptr), 1,
static_cast<Scalar*>(d_absNew.ptr));
scalar_div_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_absNew.ptr), static_cast<Scalar*>(d_absOld.ptr),
static_cast<Scalar*>(d_beta.ptr));
cublasDscal(cublas, n, static_cast<Scalar*>(d_beta.ptr), static_cast<Scalar*>(d_p.ptr), 1);
cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_HOST);
Scalar one = 1.0;
cublasDaxpy(cublas, n, &one, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_p.ptr), 1);
cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_DEVICE);
}
cudaStreamSynchronize(stream);
}
state.SetItemsProcessed(state.iterations() * maxIters);
cusparseDestroySpMat(matA);
cusparseDestroy(cusparse);
cublasDestroy(cublas);
cudaStreamDestroy(stream);
}
BENCHMARK(BM_CG_DevicePointerMode)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);

View File

@@ -0,0 +1,216 @@
// Benchmark: GPU CG vs CPU CG on realistic sparse systems.
//
// Tests 2D Laplacian (5-point stencil) and 3D Laplacian (7-point stencil)
// in both float and double precision.
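// Interior rows have diagonal 4 (2D) or 6 (3D) and -1 for each grid neighbour;
// boundary rows simply omit the missing neighbours, keeping the matrix
// symmetric positive definite.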
//
// Usage:
// cmake --build build-bench-gpu --target bench_gpu_cg_vs_cpu
// ./build-bench-gpu/bench_gpu_cg_vs_cpu
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Sparse matrix generators -----------------------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_laplacian_2d(int grid_n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
const int n = grid_n * grid_n;
SpMat A(n, n);
A.reserve(VectorXi::Constant(n, 5));
for (int i = 0; i < grid_n; ++i) {
for (int j = 0; j < grid_n; ++j) {
int idx = i * grid_n + j;
A.insert(idx, idx) = Scalar(4);
if (i > 0) A.insert(idx, idx - grid_n) = Scalar(-1);
if (i < grid_n - 1) A.insert(idx, idx + grid_n) = Scalar(-1);
if (j > 0) A.insert(idx, idx - 1) = Scalar(-1);
if (j < grid_n - 1) A.insert(idx, idx + 1) = Scalar(-1);
}
}
A.makeCompressed();
return A;
}
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_laplacian_3d(int grid_n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
const int n = grid_n * grid_n * grid_n;
const int n2 = grid_n * grid_n;
SpMat A(n, n);
A.reserve(VectorXi::Constant(n, 7));
for (int i = 0; i < grid_n; ++i) {
for (int j = 0; j < grid_n; ++j) {
for (int k = 0; k < grid_n; ++k) {
int idx = i * n2 + j * grid_n + k;
A.insert(idx, idx) = Scalar(6);
if (i > 0) A.insert(idx, idx - n2) = Scalar(-1);
if (i < grid_n - 1) A.insert(idx, idx + n2) = Scalar(-1);
if (j > 0) A.insert(idx, idx - grid_n) = Scalar(-1);
if (j < grid_n - 1) A.insert(idx, idx + grid_n) = Scalar(-1);
if (k > 0) A.insert(idx, idx - 1) = Scalar(-1);
if (k < grid_n - 1) A.insert(idx, idx + 1) = Scalar(-1);
}
}
}
A.makeCompressed();
return A;
}
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
// ---- CPU CG -----------------------------------------------------------------
template <typename Scalar, typename MatGen>
void run_cpu_cg(benchmark::State& state, MatGen make_matrix) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
const int grid_n = state.range(0);
SpMat A = make_matrix(grid_n);
Vec b = Vec::Random(A.rows());
ConjugateGradient<SpMat, Lower | Upper> cg;
cg.setMaxIterations(10000);
cg.setTolerance(RealScalar(1e-8));
cg.compute(A);
int last_iters = 0;
for (auto _ : state) {
Vec x = cg.solve(b);
benchmark::DoNotOptimize(x.data());
last_iters = cg.iterations();
}
state.counters["n"] = A.rows();
state.counters["nnz"] = A.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = cg.error();
}
// ---- GPU CG -----------------------------------------------------------------
template <typename Scalar, typename MatGen>
void run_gpu_cg(benchmark::State& state, MatGen make_matrix) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
cuda_warmup();
const int grid_n = state.range(0);
SpMat A = make_matrix(grid_n);
const Index n = A.rows();
Vec b = Vec::Random(n);
// Extract inverse diagonal.
Vec invdiag(n);
for (Index j = 0; j < A.outerSize(); ++j) {
typename SpMat::InnerIterator it(A, j);
while (it && it.index() != j) ++it;
if (it && it.index() == j && it.value() != Scalar(0))
invdiag(j) = Scalar(1) / it.value();
else
invdiag(j) = Scalar(1);
}
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<Scalar> spmv_ctx(ctx);
auto mat = spmv_ctx.deviceView(A);
auto d_invdiag = DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
int last_iters = 0;
RealScalar last_error = 0;
for (auto _ : state) {
DeviceMatrix<Scalar> d_x(n, 1);
d_x.setZero(ctx);
DeviceMatrix<Scalar> residual(n, 1);
residual.copyFrom(ctx, d_b);
RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
RealScalar tol = RealScalar(1e-8);
RealScalar threshold = tol * tol * rhsNorm2;
RealScalar residualNorm2 = residual.squaredNorm(ctx);
DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
auto absNew = residual.dot(ctx, p);
Index i = 0;
Index maxIters = 10000;
while (i < maxIters) {
tmp.noalias() = mat * p;
auto alpha = absNew / p.dot(ctx, tmp);
d_x += alpha * p;
residual -= alpha * tmp;
residualNorm2 = residual.squaredNorm(ctx);
if (residualNorm2 < threshold) break;
z.cwiseProduct(ctx, d_invdiag, residual);
auto absOld = std::move(absNew);
absNew = residual.dot(ctx, z);
auto beta = absNew / absOld;
p *= beta;
p += z;
i++;
}
benchmark::DoNotOptimize(d_x.data());
last_iters = i;
last_error = numext::sqrt(residualNorm2 / rhsNorm2);
}
GpuContext::setThreadLocal(nullptr);
state.counters["n"] = n;
state.counters["nnz"] = A.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
}
// ---- 2D Laplacian, double ---------------------------------------------------
static void BM_CG_CPU_2D_double(benchmark::State& state) { run_cpu_cg<double>(state, make_laplacian_2d<double>); }
static void BM_CG_GPU_2D_double(benchmark::State& state) { run_gpu_cg<double>(state, make_laplacian_2d<double>); }
BENCHMARK(BM_CG_CPU_2D_double)->ArgsProduct({{32, 64, 128, 256, 512}});
BENCHMARK(BM_CG_GPU_2D_double)->ArgsProduct({{32, 64, 128, 256, 512}});
// ---- 2D Laplacian, float ----------------------------------------------------
static void BM_CG_CPU_2D_float(benchmark::State& state) { run_cpu_cg<float>(state, make_laplacian_2d<float>); }
static void BM_CG_GPU_2D_float(benchmark::State& state) { run_gpu_cg<float>(state, make_laplacian_2d<float>); }
BENCHMARK(BM_CG_CPU_2D_float)->ArgsProduct({{32, 64, 128, 256, 512}});
BENCHMARK(BM_CG_GPU_2D_float)->ArgsProduct({{32, 64, 128, 256, 512}});
// ---- 3D Laplacian, double ---------------------------------------------------
static void BM_CG_CPU_3D_double(benchmark::State& state) { run_cpu_cg<double>(state, make_laplacian_3d<double>); }
static void BM_CG_GPU_3D_double(benchmark::State& state) { run_gpu_cg<double>(state, make_laplacian_3d<double>); }
BENCHMARK(BM_CG_CPU_3D_double)->ArgsProduct({{16, 32, 48, 64}});
BENCHMARK(BM_CG_GPU_3D_double)->ArgsProduct({{16, 32, 48, 64}});
// ---- 3D Laplacian, float ----------------------------------------------------
static void BM_CG_CPU_3D_float(benchmark::State& state) { run_cpu_cg<float>(state, make_laplacian_3d<float>); }
static void BM_CG_GPU_3D_float(benchmark::State& state) { run_gpu_cg<float>(state, make_laplacian_3d<float>); }
BENCHMARK(BM_CG_CPU_3D_float)->ArgsProduct({{16, 32, 48, 64}});
BENCHMARK(BM_CG_GPU_3D_float)->ArgsProduct({{16, 32, 48, 64}});

View File

@@ -0,0 +1,185 @@
// GPU FFT benchmarks: GpuFFT 1D and 2D throughput.
//
// Measures forward and inverse FFT performance across a range of sizes,
// including plan-amortized (reuse) and cold-start (new plan) scenarios.
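// Throughput counters: items = samples transformed per iteration; bytes = one
// read plus one write of the transform data (see the SetBytesProcessed calls).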
//
// Usage:
// cmake --build build-bench-gpu --target bench_gpu_fft
// ./build-bench-gpu/bench_gpu_fft
//
// Profiling:
// nsys profile --trace=cuda ./build-bench-gpu/bench_gpu_fft
#include <benchmark/benchmark.h>
#include <Eigen/GPU>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR float
#endif
using Scalar = SCALAR;
using Complex = std::complex<Scalar>;
using CVec = Matrix<Complex, Dynamic, 1>;
using RVec = Matrix<Scalar, Dynamic, 1>;
using CMat = Matrix<Complex, Dynamic, Dynamic>;
// CUDA warm-up: ensure the GPU is initialized before timing.
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
// --------------------------------------------------------------------------
// 1D C2C Forward
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_C2C_Fwd(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CVec x = CVec::Random(n);
GpuFFT<Scalar> fft;
// Warm up plan.
CVec tmp = fft.fwd(x);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.fwd(x));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2); // read + write
}
BENCHMARK(BM_GpuFFT_1D_C2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 1D C2C Inverse
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_C2C_Inv(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CVec x = CVec::Random(n);
GpuFFT<Scalar> fft;
CVec X = fft.fwd(x);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.inv(X));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2);
}
BENCHMARK(BM_GpuFFT_1D_C2C_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 1D R2C Forward
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_R2C_Fwd(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
RVec r = RVec::Random(n);
GpuFFT<Scalar> fft;
// Warm up plan.
CVec tmp = fft.fwd(r);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.fwd(r));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * (n * sizeof(Scalar) + (n / 2 + 1) * sizeof(Complex)));
}
BENCHMARK(BM_GpuFFT_1D_R2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 1D C2R Inverse
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_C2R_Inv(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
RVec r = RVec::Random(n);
GpuFFT<Scalar> fft;
CVec R = fft.fwd(r);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.invReal(R, n));
}
state.SetItemsProcessed(state.iterations() * n);
state.SetBytesProcessed(state.iterations() * ((n / 2 + 1) * sizeof(Complex) + n * sizeof(Scalar)));
}
BENCHMARK(BM_GpuFFT_1D_C2R_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
// --------------------------------------------------------------------------
// 2D C2C Forward
// --------------------------------------------------------------------------
static void BM_GpuFFT_2D_C2C_Fwd(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0); // square n x n
CMat A = CMat::Random(n, n);
GpuFFT<Scalar> fft;
// Warm up plan.
CMat tmp = fft.fwd2d(A);
for (auto _ : state) {
benchmark::DoNotOptimize(fft.fwd2d(A));
}
state.SetItemsProcessed(state.iterations() * n * n);
state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 2);
}
BENCHMARK(BM_GpuFFT_2D_C2C_Fwd)->RangeMultiplier(2)->Range(64, 4096);
// --------------------------------------------------------------------------
// 2D C2C Roundtrip (fwd + inv)
// --------------------------------------------------------------------------
static void BM_GpuFFT_2D_C2C_Roundtrip(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CMat A = CMat::Random(n, n);
GpuFFT<Scalar> fft;
// Warm up plans.
CMat tmp = fft.inv2d(fft.fwd2d(A));
for (auto _ : state) {
CMat B = fft.fwd2d(A);
benchmark::DoNotOptimize(fft.inv2d(B));
}
state.SetItemsProcessed(state.iterations() * n * n * 2); // fwd + inv
state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 4);
}
BENCHMARK(BM_GpuFFT_2D_C2C_Roundtrip)->RangeMultiplier(2)->Range(64, 4096);
// --------------------------------------------------------------------------
// 1D Cold start (includes plan creation)
// --------------------------------------------------------------------------
static void BM_GpuFFT_1D_ColdStart(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
CVec x = CVec::Random(n);
for (auto _ : state) {
GpuFFT<Scalar> fft; // new object = new plans
benchmark::DoNotOptimize(fft.fwd(x));
}
state.SetItemsProcessed(state.iterations() * n);
}
BENCHMARK(BM_GpuFFT_1D_ColdStart)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);

View File

@@ -481,13 +481,13 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
ei_add_test(gpu_basic)
ei_add_test(gpu_library_example "" "CUDA::cusolver")
# DeviceMatrix tests: only CUDA runtime, no NVIDIA libraries.
# DeviceMatrix tests: CUDA runtime + cuBLAS + cuSOLVER + NPP (BLAS-1 ops and DeviceScalar via GpuContext).
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
add_executable(gpu_device_matrix gpu_device_matrix.cpp)
target_include_directories(gpu_device_matrix PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart)
target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart CUDA::cublas CUDA::cusolver CUDA::npps CUDA::nppc)
target_compile_definitions(gpu_device_matrix PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
@@ -547,11 +547,103 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif()
# cuFFT test (cuFFT is part of the CUDA toolkit — no separate option needed).
if(TARGET CUDA::cufft)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
add_executable(gpu_cufft gpu_cufft.cpp)
target_include_directories(gpu_cufft PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_cufft
Eigen3::Eigen CUDA::cudart CUDA::cufft CUDA::cublas)
target_compile_definitions(gpu_cufft PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
add_test(NAME gpu_cufft COMMAND gpu_cufft)
add_dependencies(buildtests gpu_cufft)
add_dependencies(buildtests_gpu gpu_cufft)
set_property(TEST gpu_cufft APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST gpu_cufft PROPERTY SKIP_RETURN_CODE 77)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif()
# cuSPARSE SpMV test (cuSPARSE is part of the CUDA toolkit).
if(TARGET CUDA::cusparse)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
add_executable(gpu_cusparse_spmv gpu_cusparse_spmv.cpp)
target_include_directories(gpu_cusparse_spmv PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_cusparse_spmv
Eigen3::Eigen CUDA::cudart CUDA::cusparse CUDA::cublas CUDA::cusolver)
target_compile_definitions(gpu_cusparse_spmv PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
add_test(NAME gpu_cusparse_spmv COMMAND gpu_cusparse_spmv)
add_dependencies(buildtests gpu_cusparse_spmv)
add_dependencies(buildtests_gpu gpu_cusparse_spmv)
set_property(TEST gpu_cusparse_spmv APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST gpu_cusparse_spmv PROPERTY SKIP_RETURN_CODE 77)
# End-to-end GPU CG test: Eigen's ConjugateGradient with DeviceMatrix.
add_executable(gpu_cg gpu_cg.cpp)
target_include_directories(gpu_cg PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_cg
Eigen3::Eigen CUDA::cudart CUDA::cusparse CUDA::cublas CUDA::cusolver CUDA::npps CUDA::nppc)
target_compile_definitions(gpu_cg PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
add_test(NAME gpu_cg COMMAND gpu_cg)
add_dependencies(buildtests gpu_cg)
add_dependencies(buildtests_gpu gpu_cg)
set_property(TEST gpu_cg APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST gpu_cg PROPERTY SKIP_RETURN_CODE 77)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif()
option(EIGEN_TEST_CUSPARSE "Test cuSPARSE integration" OFF)
if(EIGEN_TEST_CUSPARSE AND TARGET CUDA::cusparse)
ei_add_test(gpu_cusparse "" "CUDA::cusparse")
endif()
# cuDSS sparse direct solver tests.
# cuDSS is distributed separately from the CUDA Toolkit.
option(EIGEN_TEST_CUDSS "Test cuDSS sparse solver integration" OFF)
if(EIGEN_TEST_CUDSS)
find_path(CUDSS_INCLUDE_DIR cudss.h
HINTS ${CUDSS_DIR}/include ${CUDA_TOOLKIT_ROOT_DIR}/include /usr/include)
find_library(CUDSS_LIBRARY cudss
HINTS ${CUDSS_DIR}/lib ${CUDSS_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib/x86_64-linux-gnu)
if(CUDSS_INCLUDE_DIR AND CUDSS_LIBRARY)
message(STATUS "cuDSS found: ${CUDSS_LIBRARY}")
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
foreach(_cudss_test IN ITEMS gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu)
add_executable(${_cudss_test} ${_cudss_test}.cpp)
target_include_directories(${_cudss_test} PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CUDSS_INCLUDE_DIR}"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(${_cudss_test}
Eigen3::Eigen CUDA::cudart CUDA::cusolver CUDA::cublas ${CUDSS_LIBRARY})
target_compile_definitions(${_cudss_test} PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1
EIGEN_CUDSS=1)
add_test(NAME ${_cudss_test} COMMAND "${_cudss_test}")
add_dependencies(buildtests ${_cudss_test})
add_dependencies(buildtests_gpu ${_cudss_test})
set_property(TEST ${_cudss_test} APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST ${_cudss_test} PROPERTY SKIP_RETURN_CODE 77)
endforeach()
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
else()
message(WARNING "EIGEN_TEST_CUDSS=ON but cuDSS not found. Set CUDSS_DIR.")
endif()
endif()
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()

224
test/gpu_cg.cpp Normal file
View File

@@ -0,0 +1,224 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// End-to-end test: CG algorithm running on GPU via DeviceMatrix.
//
// Uses DeviceSparseView for SpMV, DeviceMatrix for vectors, DeviceScalar
// for deferred reductions. Verifies correctness against CPU ConjugateGradient.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a sparse SPD matrix --------------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
R.makeCompressed();
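// A = R^H * R + n * I (guaranteed SPD).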
SpMat A = R.adjoint() * R;
for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
A.makeCompressed();
return A;
}
// ---- GPU CG without preconditioner ------------------------------------------
template <typename Scalar>
void test_gpu_cg(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
// CPU reference (identity preconditioner to match GPU).
ConjugateGradient<SpMat, Lower | Upper, IdentityPreconditioner> cpu_cg;
cpu_cg.setMaxIterations(1000);
cpu_cg.setTolerance(RealScalar(1e-8));
cpu_cg.compute(A);
Vec x_cpu = cpu_cg.solve(b);
VERIFY_IS_EQUAL(cpu_cg.info(), Success);
// GPU CG: mirrors Eigen's conjugate_gradient() using DeviceMatrix ops.
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<Scalar> spmv_ctx(ctx);
auto mat = spmv_ctx.deviceView(A);
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
DeviceMatrix<Scalar> d_x(n, 1);
d_x.setZero(ctx);
// r = b (since x=0)
DeviceMatrix<Scalar> residual(n, 1);
residual.copyFrom(ctx, d_b);
RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
RealScalar tol = RealScalar(1e-8);
RealScalar threshold = tol * tol * rhsNorm2;
RealScalar residualNorm2 = residual.squaredNorm(ctx);
// p = r (no preconditioner)
DeviceMatrix<Scalar> p(n, 1);
p.copyFrom(ctx, residual);
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
auto absNew = residual.dot(ctx, p);
Index maxIters = 1000;
Index i = 0;
while (i < maxIters) {
tmp.noalias() = mat * p;
auto alpha = absNew / p.dot(ctx, tmp);
d_x += alpha * p;
residual -= alpha * tmp;
residualNorm2 = residual.squaredNorm(ctx);
if (residualNorm2 < threshold) break;
// z = r (no preconditioner)
z.copyFrom(ctx, residual);
auto absOld = std::move(absNew);
absNew = residual.dot(ctx, z);
auto beta = absNew / absOld;
p *= Scalar(beta);
p += z;
i++;
}
GpuContext::setThreadLocal(nullptr);
Vec x_gpu = d_x.toHost(ctx.stream());
// Verify residual.
Vec r = A * x_gpu - b;
RealScalar relres = r.norm() / b.norm();
VERIFY(relres < RealScalar(1e-6));
// Compare with CPU.
RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
}
// ---- GPU CG with Jacobi preconditioner --------------------------------------
template <typename Scalar>
void test_gpu_cg_jacobi(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
// CPU reference.
ConjugateGradient<SpMat, Lower | Upper> cpu_cg;
cpu_cg.setMaxIterations(1000);
cpu_cg.setTolerance(RealScalar(1e-8));
cpu_cg.compute(A);
Vec x_cpu = cpu_cg.solve(b);
// Extract inverse diagonal.
Vec invdiag(n);
for (Index j = 0; j < A.outerSize(); ++j) {
typename SpMat::InnerIterator it(A, j);
while (it && it.index() != j) ++it;
if (it && it.index() == j && it.value() != Scalar(0))
invdiag(j) = Scalar(1) / it.value();
else
invdiag(j) = Scalar(1);
}
// GPU CG with Jacobi preconditioner.
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<Scalar> spmv_ctx(ctx);
auto mat = spmv_ctx.deviceView(A);
auto d_invdiag = DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
DeviceMatrix<Scalar> d_x(n, 1);
d_x.setZero(ctx);
DeviceMatrix<Scalar> residual(n, 1);
residual.copyFrom(ctx, d_b);
RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
RealScalar tol = RealScalar(1e-8);
RealScalar threshold = tol * tol * rhsNorm2;
RealScalar residualNorm2 = residual.squaredNorm(ctx);
// p = precond.solve(r) = invdiag .* r
DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
auto absNew = residual.dot(ctx, p);
Index maxIters = 1000;
Index i = 0;
while (i < maxIters) {
tmp.noalias() = mat * p;
auto alpha = absNew / p.dot(ctx, tmp);
d_x += alpha * p;
residual -= alpha * tmp;
residualNorm2 = residual.squaredNorm(ctx);
if (residualNorm2 < threshold) break;
// z = precond.solve(r) = invdiag .* r
z.cwiseProduct(ctx, d_invdiag, residual);
auto absOld = std::move(absNew);
absNew = residual.dot(ctx, z);
auto beta = absNew / absOld;
p *= beta;
p += z;
i++;
}
GpuContext::setThreadLocal(nullptr);
Vec x_gpu = d_x.toHost(ctx.stream());
Vec r = A * x_gpu - b;
RealScalar relres = r.norm() / b.norm();
VERIFY(relres < RealScalar(1e-6));
RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
}
EIGEN_DECLARE_TEST(gpu_cg) {
CALL_SUBTEST(test_gpu_cg<double>(64));
CALL_SUBTEST(test_gpu_cg<double>(256));
CALL_SUBTEST(test_gpu_cg<float>(64));
CALL_SUBTEST(test_gpu_cg_jacobi<double>(64));
CALL_SUBTEST(test_gpu_cg_jacobi<double>(256));
CALL_SUBTEST(test_gpu_cg_jacobi<float>(64));
}

154
test/gpu_cudss_ldlt.cpp Normal file
View File

@@ -0,0 +1,154 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseLDLT: GPU sparse LDL^T via cuDSS.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse symmetric indefinite matrix ---------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_symmetric_indefinite(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
// Build a random sparse matrix and symmetrize it.
// The diagonal has mixed signs to ensure indefiniteness.
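// A plain LL^T (Cholesky) factorization would fail on such a matrix, so this
// exercises the LDL^T path rather than the SPD one.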
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
R.makeCompressed();
// A = R + R^H (symmetric), then add diagonal with alternating signs for indefiniteness.
SpMat A = R + SparseMatrix<Scalar, ColMajor, int>(R.adjoint());
for (Index i = 0; i < n; ++i) {
Scalar diag_val = Scalar((i % 2 == 0) ? n : -n);
A.coeffRef(i, i) += diag_val;
}
A.makeCompressed();
return A;
}
// ---- Solve and check residual -----------------------------------------------
template <typename Scalar>
void test_solve(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_symmetric_indefinite<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLDLT<Scalar> ldlt(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Vec x = ldlt.solve(b);
VERIFY_IS_EQUAL(x.rows(), n);
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Multiple RHS -----------------------------------------------------------
template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_symmetric_indefinite<Scalar>(n);
Mat B = Mat::Random(n, nrhs);
GpuSparseLDLT<Scalar> ldlt(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Mat X = ldlt.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
Mat R = A * X - B;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(R.norm() / B.norm() < tol);
}
// ---- Refactorize ------------------------------------------------------------
template <typename Scalar>
void test_refactorize(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_symmetric_indefinite<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLDLT<Scalar> ldlt;
ldlt.analyzePattern(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
ldlt.factorize(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Vec x1 = ldlt.solve(b);
// Modify values, keep pattern.
SpMat A2 = A;
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
ldlt.factorize(A2);
VERIFY_IS_EQUAL(ldlt.info(), Success);
Vec x2 = ldlt.solve(b);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((A * x1 - b).norm() / b.norm() < tol);
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}
// ---- Empty ------------------------------------------------------------------
void test_empty() {
using SpMat = SparseMatrix<double, ColMajor, int>;
SpMat A(0, 0);
A.makeCompressed();
GpuSparseLDLT<double> ldlt(A);
VERIFY_IS_EQUAL(ldlt.info(), Success);
VERIFY_IS_EQUAL(ldlt.rows(), 0);
VERIFY_IS_EQUAL(ldlt.cols(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_solve<Scalar>(64));
CALL_SUBTEST(test_solve<Scalar>(256));
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
CALL_SUBTEST(test_refactorize<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cudss_ldlt) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_empty());
}

202
test/gpu_cudss_llt.cpp Normal file
View File

@@ -0,0 +1,202 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseLLT: GPU sparse Cholesky via cuDSS.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse SPD matrix -------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
// Uses the global std::rand state seeded by the test framework (g_seed).
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
R.makeCompressed();
// A = R^H * R + n * I (guaranteed SPD).
SpMat A = R.adjoint() * R;
for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
A.makeCompressed();
return A;
}
// ---- Solve and check residual -----------------------------------------------
template <typename Scalar>
void test_solve(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x = llt.solve(b);
VERIFY_IS_EQUAL(x.rows(), n);
// Check residual: ||Ax - b|| / ||b||.
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Compare with CPU SimplicialLLT -----------------------------------------
template <typename Scalar>
void test_vs_cpu(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar> gpu_llt(A);
VERIFY_IS_EQUAL(gpu_llt.info(), Success);
Vec x_gpu = gpu_llt.solve(b);
SimplicialLLT<SpMat> cpu_llt(A);
VERIFY_IS_EQUAL(cpu_llt.info(), Success);
Vec x_cpu = cpu_llt.solve(b);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((x_gpu - x_cpu).norm() / x_cpu.norm() < tol);
}
// ---- Multiple RHS -----------------------------------------------------------
template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Mat B = Mat::Random(n, nrhs);
GpuSparseLLT<Scalar> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Mat X = llt.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
Mat R = A * X - B;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(R.norm() / B.norm() < tol);
}
// ---- Separate analyze + factorize (refactorization) -------------------------
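// analyzePattern() does the symbolic (pattern-only) work; factorize() can then
// be called repeatedly for matrices sharing that sparsity pattern with
// different values.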
template <typename Scalar>
void test_refactorize(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar> llt;
llt.analyzePattern(A);
VERIFY_IS_EQUAL(llt.info(), Success);
// First factorize + solve.
llt.factorize(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x1 = llt.solve(b);
// Modify values (keep same pattern): scale diagonal.
SpMat A2 = A;
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
// Refactorize with same pattern.
llt.factorize(A2);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x2 = llt.solve(b);
// Both solutions should satisfy their respective systems.
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((A * x1 - b).norm() / b.norm() < tol);
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
// Solutions should differ (A2 != A).
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}
// ---- Empty matrix -----------------------------------------------------------
void test_empty() {
using SpMat = SparseMatrix<double, ColMajor, int>;
SpMat A(0, 0);
A.makeCompressed();
GpuSparseLLT<double> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
VERIFY_IS_EQUAL(llt.rows(), 0);
VERIFY_IS_EQUAL(llt.cols(), 0);
}
// ---- Upper triangle ---------------------------------------------------------
template <typename Scalar>
void test_upper(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLLT<Scalar, Upper> llt(A);
VERIFY_IS_EQUAL(llt.info(), Success);
Vec x = llt.solve(b);
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_solve<Scalar>(64));
CALL_SUBTEST(test_solve<Scalar>(256));
CALL_SUBTEST(test_vs_cpu<Scalar>(64));
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
CALL_SUBTEST(test_refactorize<Scalar>(64));
CALL_SUBTEST(test_upper<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cudss_llt) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_empty());
}

147
test/gpu_cudss_lu.cpp Normal file
View File

@@ -0,0 +1,147 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseLU: GPU sparse LU via cuDSS.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse non-singular general matrix ---------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_general(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
// Add strong diagonal for non-singularity.
for (Index i = 0; i < n; ++i) R.coeffRef(i, i) += Scalar(RealScalar(n));
R.makeCompressed();
return R;
}
// ---- Solve and check residual -----------------------------------------------
template <typename Scalar>
void test_solve(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_general<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLU<Scalar> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
Vec x = lu.solve(b);
VERIFY_IS_EQUAL(x.rows(), n);
Vec r = A * x - b;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(r.norm() / b.norm() < tol);
}
// ---- Multiple RHS -----------------------------------------------------------
template <typename Scalar>
void test_multiple_rhs(Index n, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_general<Scalar>(n);
Mat B = Mat::Random(n, nrhs);
GpuSparseLU<Scalar> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
Mat X = lu.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
Mat R = A * X - B;
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(R.norm() / B.norm() < tol);
}
// ---- Refactorize ------------------------------------------------------------
template <typename Scalar>
void test_refactorize(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_general<Scalar>(n);
Vec b = Vec::Random(n);
GpuSparseLU<Scalar> lu;
lu.analyzePattern(A);
VERIFY_IS_EQUAL(lu.info(), Success);
lu.factorize(A);
VERIFY_IS_EQUAL(lu.info(), Success);
Vec x1 = lu.solve(b);
// Modify values, keep pattern.
SpMat A2 = A;
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
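// The sparsity pattern is unchanged, so factorize() can be called again
// directly; analyzePattern() does not need to be re-run.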
lu.factorize(A2);
VERIFY_IS_EQUAL(lu.info(), Success);
Vec x2 = lu.solve(b);
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((A * x1 - b).norm() / b.norm() < tol);
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
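// The two solutions must differ, since A2's diagonal was doubled.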
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
}
// ---- Empty ------------------------------------------------------------------
void test_empty() {
using SpMat = SparseMatrix<double, ColMajor, int>;
SpMat A(0, 0);
A.makeCompressed();
GpuSparseLU<double> lu(A);
VERIFY_IS_EQUAL(lu.info(), Success);
VERIFY_IS_EQUAL(lu.rows(), 0);
VERIFY_IS_EQUAL(lu.cols(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_solve<Scalar>(64));
CALL_SUBTEST(test_solve<Scalar>(256));
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
CALL_SUBTEST(test_refactorize<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cudss_lu) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_empty());
}

test/gpu_cufft.cpp (new file, 186 lines)

@@ -0,0 +1,186 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuFFT: GPU FFT via cuFFT.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/GPU>
using namespace Eigen;
// ---- 1D C2C roundtrip: inv(fwd(x)) ≈ x -------------------------------------
template <typename Scalar>
void test_c2c_roundtrip(Index n) {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
using RealScalar = Scalar;
Vec x = Vec::Random(n);
GpuFFT<Scalar> fft;
Vec X = fft.fwd(x);
VERIFY_IS_EQUAL(X.size(), n);
Vec y = fft.inv(X);
VERIFY_IS_EQUAL(y.size(), n);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y - x).norm() / x.norm() < tol);
}
// ---- 1D C2C known signal: FFT of constant = delta --------------------------
template <typename Scalar>
void test_c2c_constant() {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
using RealScalar = Scalar;
const int n = 64;
Vec x = Vec::Constant(n, Complex(3.0, 0.0));
GpuFFT<Scalar> fft;
Vec X = fft.fwd(x);
// FFT of constant c: X[0] = c*n, X[k] = 0 for k > 0.
RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(n);
VERIFY(std::abs(X(0) - Complex(3.0 * n, 0.0)) < tol);
for (int k = 1; k < n; ++k) {
VERIFY(std::abs(X(k)) < tol);
}
}
// ---- 1D R2C/C2R roundtrip: invReal(fwd(r), n) ≈ r --------------------------
template <typename Scalar>
void test_r2c_roundtrip(Index n) {
using Complex = std::complex<Scalar>;
using CVec = Matrix<Complex, Dynamic, 1>;
using RVec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = Scalar;
RVec r = RVec::Random(n);
GpuFFT<Scalar> fft;
CVec R = fft.fwd(r);
// R2C returns n/2+1 complex values.
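// (the remaining bins are redundant by conjugate symmetry: X[n-k] = conj(X[k]))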
VERIFY_IS_EQUAL(R.size(), n / 2 + 1);
RVec s = fft.invReal(R, n);
VERIFY_IS_EQUAL(s.size(), n);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((s - r).norm() / r.norm() < tol);
}
// ---- 2D C2C roundtrip: inv2d(fwd2d(A)) ≈ A ---------------------------------
template <typename Scalar>
void test_2d_roundtrip(Index rows, Index cols) {
using Complex = std::complex<Scalar>;
using Mat = Matrix<Complex, Dynamic, Dynamic>;
using RealScalar = Scalar;
Mat A = Mat::Random(rows, cols);
GpuFFT<Scalar> fft;
Mat B = fft.fwd2d(A);
VERIFY_IS_EQUAL(B.rows(), rows);
VERIFY_IS_EQUAL(B.cols(), cols);
Mat C = fft.inv2d(B);
VERIFY_IS_EQUAL(C.rows(), rows);
VERIFY_IS_EQUAL(C.cols(), cols);
RealScalar tol = RealScalar(10) * RealScalar(rows * cols) * NumTraits<Scalar>::epsilon();
VERIFY((C - A).norm() / A.norm() < tol);
}
// ---- 2D C2C known signal: constant matrix -----------------------------------
template <typename Scalar>
void test_2d_constant() {
using Complex = std::complex<Scalar>;
using Mat = Matrix<Complex, Dynamic, Dynamic>;
using RealScalar = Scalar;
const int rows = 16, cols = 32;
Mat A = Mat::Constant(rows, cols, Complex(2.0, 0.0));
GpuFFT<Scalar> fft;
Mat B = fft.fwd2d(A);
// 2D FFT of constant c: B(0,0) = c*rows*cols, all others = 0.
RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(rows * cols);
VERIFY(std::abs(B(0, 0) - Complex(2.0 * rows * cols, 0.0)) < tol);
for (int j = 0; j < cols; ++j) {
for (int i = 0; i < rows; ++i) {
if (i == 0 && j == 0) continue;
VERIFY(std::abs(B(i, j)) < tol);
}
}
}
// ---- Plan reuse: repeated calls should work ---------------------------------
template <typename Scalar>
void test_plan_reuse() {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
using RealScalar = Scalar;
GpuFFT<Scalar> fft;
for (int trial = 0; trial < 5; ++trial) {
Vec x = Vec::Random(128);
Vec X = fft.fwd(x);
Vec y = fft.inv(X);
RealScalar tol = RealScalar(10) * RealScalar(128) * NumTraits<Scalar>::epsilon();
VERIFY((y - x).norm() / x.norm() < tol);
}
}
// ---- Empty ------------------------------------------------------------------
template <typename Scalar>
void test_empty() {
using Complex = std::complex<Scalar>;
using Vec = Matrix<Complex, Dynamic, 1>;
GpuFFT<Scalar> fft;
Vec x(0);
Vec X = fft.fwd(x);
VERIFY_IS_EQUAL(X.size(), 0);
Vec y = fft.inv(X);
VERIFY_IS_EQUAL(y.size(), 0);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(64));
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(256));
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(1000)); // non-power-of-2
CALL_SUBTEST(test_c2c_constant<Scalar>());
CALL_SUBTEST(test_r2c_roundtrip<Scalar>(64));
CALL_SUBTEST(test_r2c_roundtrip<Scalar>(256));
CALL_SUBTEST(test_2d_roundtrip<Scalar>(32, 32));
CALL_SUBTEST(test_2d_roundtrip<Scalar>(16, 64)); // non-square
CALL_SUBTEST(test_2d_constant<Scalar>());
CALL_SUBTEST(test_plan_reuse<Scalar>());
CALL_SUBTEST(test_empty<Scalar>());
}
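// Only real scalar types drive these tests: GpuFFT<Scalar> takes the real
// scalar type, and std::complex<Scalar> is used for the complex-valued data.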
EIGEN_DECLARE_TEST(gpu_cufft) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
}

test/gpu_cusparse_spmv.cpp (new file, 305 lines)

@@ -0,0 +1,305 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for GpuSparseContext: GPU SpMV/SpMM via cuSPARSE.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a random sparse matrix -----------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_sparse(Index rows, Index cols, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat R(rows, cols);
R.reserve(VectorXi::Constant(cols, static_cast<int>(rows * density) + 1));
for (Index j = 0; j < cols; ++j) {
for (Index i = 0; i < rows; ++i) {
if ((std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(RealScalar(std::rand() / double(RAND_MAX) - 0.5));
}
}
}
R.makeCompressed();
return R;
}
// ---- SpMV: y = A * x -------------------------------------------------------
template <typename Scalar>
void test_spmv(Index rows, Index cols) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(rows, cols);
Vec x = Vec::Random(cols);
GpuSparseContext<Scalar> ctx;
Vec y_gpu = ctx.multiply(A, x);
Vec y_cpu = A * x;
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
VERIFY_IS_EQUAL(y_gpu.size(), rows);
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- SpMV with alpha/beta: y = alpha*A*x + beta*y ---------------------------
template <typename Scalar>
void test_spmv_alpha_beta(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(n, n);
Vec x = Vec::Random(n);
Vec y_init = Vec::Random(n);
Scalar alpha(2);
Scalar beta(3);
Vec y_cpu = alpha * (A * x) + beta * y_init;
GpuSparseContext<Scalar> ctx;
Vec y_gpu = y_init;
ctx.multiply(A, x, y_gpu, alpha, beta);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- Transpose: y = A^T * x ------------------------------------------------
template <typename Scalar>
void test_spmv_transpose(Index rows, Index cols) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(rows, cols);
Vec x = Vec::Random(rows);
GpuSparseContext<Scalar> ctx;
Vec y_gpu = ctx.multiplyT(A, x);
Vec y_cpu = A.transpose() * x;
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
VERIFY_IS_EQUAL(y_gpu.size(), cols);
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- SpMM: Y = A * X (multiple RHS) ----------------------------------------
template <typename Scalar>
void test_spmm(Index rows, Index cols, Index nrhs) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(rows, cols);
Mat X = Mat::Random(cols, nrhs);
GpuSparseContext<Scalar> ctx;
Mat Y_gpu = ctx.multiplyMat(A, X);
Mat Y_cpu = A * X;
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
VERIFY_IS_EQUAL(Y_gpu.rows(), rows);
VERIFY_IS_EQUAL(Y_gpu.cols(), nrhs);
VERIFY((Y_gpu - Y_cpu).norm() / (Y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- Identity matrix: I * x = x --------------------------------------------
template <typename Scalar>
void test_identity(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
// Build sparse identity.
SpMat eye(n, n);
eye.setIdentity();
eye.makeCompressed();
Vec x = Vec::Random(n);
GpuSparseContext<Scalar> ctx;
Vec y = ctx.multiply(eye, x);
RealScalar tol = NumTraits<Scalar>::epsilon();
VERIFY((y - x).norm() < tol);
}
// ---- Context reuse ----------------------------------------------------------
template <typename Scalar>
void test_reuse(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
GpuSparseContext<Scalar> ctx;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
for (int trial = 0; trial < 3; ++trial) {
SpMat A = make_sparse<Scalar>(n, n);
Vec x = Vec::Random(n);
Vec y_gpu = ctx.multiply(A, x);
Vec y_cpu = A * x;
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
}
// ---- Empty ------------------------------------------------------------------
template <typename Scalar>
void test_empty() {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
SpMat A(0, 0);
A.makeCompressed();
Vec x(0);
GpuSparseContext<Scalar> ctx;
Vec y = ctx.multiply(A, x);
VERIFY_IS_EQUAL(y.size(), 0);
}
// ---- DeviceMatrix SpMV (no host roundtrip) ----------------------------------
template <typename Scalar>
void test_spmv_device(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(n, n);
Vec x = Vec::Random(n);
// Use shared GpuContext for same-stream execution.
GpuContext gpu_ctx;
GpuSparseContext<Scalar> ctx(gpu_ctx);
auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
DeviceMatrix<Scalar> d_y;
ctx.multiply(A, d_x, d_y);
Vec y_gpu = d_y.toHost(gpu_ctx.stream());
Vec y_cpu = A * x;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- Expression syntax: d_y = d_A * d_x ------------------------------------
template <typename Scalar>
void test_spmv_expr(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(n, n);
Vec x = Vec::Random(n);
GpuContext gpu_ctx;
GpuSparseContext<Scalar> ctx(gpu_ctx);
// Upload sparse matrix and create device view.
auto d_A = ctx.deviceView(A);
// Upload x.
auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
// Expression syntax: d_y = d_A * d_x
DeviceMatrix<Scalar> d_y;
d_y = d_A * d_x;
// Also test with noalias():
DeviceMatrix<Scalar> d_tmp;
d_tmp.noalias() = d_A * d_x;
Vec y_gpu = d_y.toHost(gpu_ctx.stream());
Vec tmp_gpu = d_tmp.toHost(gpu_ctx.stream());
Vec y_cpu = A * x;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
VERIFY((tmp_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- deviceView overwrite: second view replaces first -----------------------
template <typename Scalar>
void test_deviceview_overwrite(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A1 = make_sparse<Scalar>(n, n);
SpMat A2 = make_sparse<Scalar>(n, n); // different random matrix
Vec x = Vec::Random(n);
GpuContext gpu_ctx;
GpuSparseContext<Scalar> ctx(gpu_ctx);
// First view: A1.
auto d_A1 = ctx.deviceView(A1);
auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
DeviceMatrix<Scalar> d_y1;
d_y1 = d_A1 * d_x;
Vec y1_gpu = d_y1.toHost(gpu_ctx.stream());
Vec y1_cpu = A1 * x;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y1_gpu - y1_cpu).norm() / (y1_cpu.norm() + RealScalar(1)) < tol);
// Second view overwrites first: now uses A2.
auto d_A2 = ctx.deviceView(A2);
DeviceMatrix<Scalar> d_y2;
d_y2 = d_A2 * d_x;
Vec y2_gpu = d_y2.toHost(gpu_ctx.stream());
Vec y2_cpu = A2 * x;
VERIFY((y2_gpu - y2_cpu).norm() / (y2_cpu.norm() + RealScalar(1)) < tol);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_spmv<Scalar>(64, 64));
CALL_SUBTEST(test_spmv<Scalar>(128, 64)); // non-square
CALL_SUBTEST(test_spmv<Scalar>(64, 128)); // wide
CALL_SUBTEST(test_spmv_alpha_beta<Scalar>(64));
CALL_SUBTEST(test_spmv_transpose<Scalar>(128, 64));
CALL_SUBTEST(test_spmm<Scalar>(64, 64, 4));
CALL_SUBTEST(test_identity<Scalar>(64));
CALL_SUBTEST(test_reuse<Scalar>(64));
CALL_SUBTEST(test_empty<Scalar>());
CALL_SUBTEST(test_spmv_device<Scalar>(64));
CALL_SUBTEST(test_spmv_expr<Scalar>(64));
CALL_SUBTEST(test_deviceview_overwrite<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
}

test/gpu_device_matrix.cpp (modified)

@@ -12,6 +12,7 @@
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>
using namespace Eigen;
@@ -230,6 +231,217 @@ void test_scalar() {
CALL_SUBTEST(test_move_assign<Scalar>(64, 64));
}
// ---- BLAS-1: dot product ----------------------------------------------------
template <typename Scalar>
void test_blas1(Index n) {
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
// All BLAS-1 ops share one GpuContext, so they run on the same stream and
// need no cross-stream event synchronization.
GpuContext ctx;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
// dot
{
Vec a = Vec::Random(n);
Vec b = Vec::Random(n);
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
Scalar gpu_dot = d_a.dot(ctx, d_b);
Scalar cpu_dot = a.dot(b);
VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
}
// norm / squaredNorm
{
Vec a = Vec::Random(n);
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
RealScalar gpu_norm = d_a.norm(ctx);
RealScalar cpu_norm = a.norm();
VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol);
RealScalar gpu_sqnorm = d_a.squaredNorm(ctx);
RealScalar cpu_sqnorm = a.squaredNorm();
VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol);
}
// addScaled (axpy)
{
Vec x = Vec::Random(n);
Vec y = Vec::Random(n);
Scalar alpha(2.5);
Vec y_ref = y + alpha * x;
auto d_y = DeviceMatrix<Scalar>::fromHost(y, ctx.stream());
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
d_y.addScaled(ctx, alpha, d_x);
Vec y_gpu = d_y.toHost(ctx.stream());
VERIFY((y_gpu - y_ref).norm() < tol * y_ref.norm() + tol);
}
// scale (scal)
{
Vec x = Vec::Random(n);
Scalar alpha(3.0);
Vec x_ref = alpha * x;
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
d_x.scale(ctx, alpha);
Vec x_gpu = d_x.toHost(ctx.stream());
VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol);
}
// copyFrom
{
Vec x = Vec::Random(n);
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
DeviceMatrix<Scalar> d_y;
d_y.copyFrom(ctx, d_x);
Vec y = d_y.toHost(ctx.stream());
VERIFY_IS_APPROX(y, x);
}
// setZero
{
Vec x = Vec::Random(n);
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
d_x.setZero(ctx);
Vec result = d_x.toHost(ctx.stream());
VERIFY_IS_EQUAL(result, Vec::Zero(n));
}
}
// ---- BLAS-1 operator overloads (CG-style) -----------------------------------
template <typename Scalar>
void test_cg_operators(Index n) {
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
Vec x = Vec::Random(n);
Vec p = Vec::Random(n);
Vec tmp = Vec::Random(n);
Vec z = Vec::Random(n);
Scalar alpha(2.5);
Scalar beta(0.7);
// Test: x += alpha * p
{
Vec x_ref = x + alpha * p;
auto d_x = DeviceMatrix<Scalar>::fromHost(x);
auto d_p = DeviceMatrix<Scalar>::fromHost(p);
d_x += alpha * d_p;
Vec x_gpu = d_x.toHost();
VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol);
}
// Test: r -= alpha * tmp
{
Vec r = Vec::Random(n);
Vec r_ref = r - alpha * tmp;
auto d_r = DeviceMatrix<Scalar>::fromHost(r);
auto d_tmp = DeviceMatrix<Scalar>::fromHost(tmp);
d_r -= alpha * d_tmp;
Vec r_gpu = d_r.toHost();
VERIFY((r_gpu - r_ref).norm() < tol * r_ref.norm() + tol);
}
// Test: p = z + beta * p (cuBLAS geam)
{
Vec p_copy = p;
Vec p_ref = z + beta * p_copy;
auto d_p = DeviceMatrix<Scalar>::fromHost(p_copy);
auto d_z = DeviceMatrix<Scalar>::fromHost(z);
d_p = d_z + beta * d_p;
Vec p_gpu = d_p.toHost();
VERIFY((p_gpu - p_ref).norm() < tol * p_ref.norm() + tol);
}
// Test: operator+= and operator-= with DeviceMatrix (no scalar)
{
Vec a = Vec::Random(n);
Vec b = Vec::Random(n);
Vec a_ref = a + b;
auto d_a = DeviceMatrix<Scalar>::fromHost(a);
auto d_b = DeviceMatrix<Scalar>::fromHost(b);
d_a += d_b;
VERIFY((d_a.toHost() - a_ref).norm() < tol * a_ref.norm() + tol);
}
}
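// ---- Illustrative sketch (not a test): one CG-style iteration ----------------
// The operators exercised above are what a device-resident conjugate gradient
// step needs. A minimal sketch, assuming d_A is a device sparse view obtained
// from GpuSparseContext::deviceView(), a thread-local GpuContext has been set
// (so the context-free dot() overload is available, see test_device_scalar
// below), and rho holds r.dot(r) from the previous iteration. Names here are
// illustrative only; this function is not instantiated by the test driver.
template <typename Scalar, typename SparseView>
void cg_step_sketch(const SparseView& d_A, DeviceMatrix<Scalar>& d_x,
                    DeviceMatrix<Scalar>& d_r, DeviceMatrix<Scalar>& d_p, Scalar& rho) {
  DeviceMatrix<Scalar> d_tmp;
  d_tmp.noalias() = d_A * d_p;                  // device-resident SpMV
  Scalar alpha = rho / Scalar(d_p.dot(d_tmp));  // DeviceScalar syncs when read
  d_x += alpha * d_p;                           // cuBLAS axpy
  d_r -= alpha * d_tmp;                         // cuBLAS axpy
  Scalar rho_new = d_r.dot(d_r);                // syncs again
  d_p = d_r + (rho_new / rho) * d_p;            // cuBLAS geam
  rho = rho_new;
}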
// ---- DeviceScalar: deferred sync -------------------------------------------
template <typename Scalar>
void test_device_scalar() {
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
const Index n = 256;
Vec a = Vec::Random(n);
Vec b = Vec::Random(n);
GpuContext ctx;
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
// dot() returns DeviceScalar — implicit conversion to Scalar syncs.
Scalar gpu_dot = d_a.dot(ctx, d_b);
Scalar cpu_dot = a.dot(b);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
// squaredNorm() returns host RealScalar directly (syncs internally).
RealScalar gpu_sqnorm = d_a.squaredNorm(ctx);
RealScalar cpu_sqnorm = a.squaredNorm();
VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol);
// norm() returns DeviceScalar<RealScalar> — implicit conversion syncs.
RealScalar gpu_norm = d_a.norm(ctx);
RealScalar cpu_norm = a.norm();
VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol);
// Convenience overloads (thread-local context).
GpuContext::setThreadLocal(&ctx);
Scalar gpu_dot2 = d_a.dot(d_b);
VERIFY(numext::abs(gpu_dot2 - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
GpuContext::setThreadLocal(nullptr);
// Empty vectors: dot and norm must return zero.
{
DeviceMatrix<Scalar> d_empty(0, 1);
DeviceMatrix<Scalar> d_empty2(0, 1);
Scalar empty_dot = d_empty.dot(ctx, d_empty2);
VERIFY_IS_EQUAL(empty_dot, Scalar(0));
RealScalar empty_sqnorm = d_empty.squaredNorm(ctx);
VERIFY_IS_EQUAL(empty_sqnorm, RealScalar(0));
RealScalar empty_norm = d_empty.norm(ctx);
VERIFY_IS_EQUAL(empty_norm, RealScalar(0));
}
}
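// ---- Illustrative sketch (not a test): deferring the host sync ---------------
// test_device_scalar() converts each reduction result immediately; the point of
// DeviceScalar is that the conversion (and thus the host sync) can be postponed
// while more work is queued on the same stream. A minimal sketch, assuming the
// result of dot() can be held by value until it is read:
template <typename Scalar>
void deferred_sync_sketch(GpuContext& ctx, DeviceMatrix<Scalar>& d_a,
                          DeviceMatrix<Scalar>& d_b, DeviceMatrix<Scalar>& d_y) {
  auto d_dot = d_a.dot(ctx, d_b);  // enqueued on ctx's stream, no host sync yet
  d_y.setZero(ctx);                // more async work issued before the result is needed
  Scalar v = d_dot;                // first read: implicit conversion syncs here
  EIGEN_UNUSED_VARIABLE(v);
}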
// ---- cwiseProduct -----------------------------------------------------------
template <typename Scalar>
void test_cwiseProduct() {
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
const Index n = 256;
Vec a = Vec::Random(n);
Vec b = Vec::Random(n);
Vec ref = a.array() * b.array();
GpuContext ctx;
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
auto d_c = d_a.cwiseProduct(ctx, d_b);
Vec result = d_c.toHost(ctx.stream());
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((result - ref).norm() < tol * ref.norm() + tol);
}
EIGEN_DECLARE_TEST(gpu_device_matrix) {
CALL_SUBTEST(test_default_construct());
CALL_SUBTEST(test_empty());
@@ -242,4 +454,18 @@ EIGEN_DECLARE_TEST(gpu_device_matrix) {
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_blas1<float>(256));
CALL_SUBTEST(test_blas1<double>(256));
CALL_SUBTEST(test_blas1<std::complex<float>>(256));
CALL_SUBTEST(test_blas1<std::complex<double>>(256));
CALL_SUBTEST(test_cg_operators<float>(256));
CALL_SUBTEST(test_cg_operators<double>(256));
CALL_SUBTEST(test_cg_operators<std::complex<float>>(256));
CALL_SUBTEST(test_cg_operators<std::complex<double>>(256));
CALL_SUBTEST(test_device_scalar<float>());
CALL_SUBTEST(test_device_scalar<double>());
CALL_SUBTEST(test_device_scalar<std::complex<float>>());
CALL_SUBTEST(test_device_scalar<std::complex<double>>());
CALL_SUBTEST(test_cwiseProduct<float>());
CALL_SUBTEST(test_cwiseProduct<double>());
}