Compare commits

...

1 Commits

Author SHA1 Message Date
Rasmus Munk Larsen
014f12f11a GPU: Add BLAS-1 ops, DeviceScalar, device-resident SpMV, and CG interop (5/5)
Add the operator interface needed for GPU iterative solvers:

- BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(),
  noalias(), operator+=/-=/\*= dispatching to cuBLAS axpy/scal/dot/nrm2.
- DeviceScalar<Scalar>: device-resident scalar returned by reductions.
  Defers host sync until value is read (implicit conversion). Device-side
  division via NPP for real types.
- GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(),
  cusparseHandle().
- GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm
  selection and plan caching.
- GpuSparseContext: GpuContext& constructor for same-stream execution,
  deviceView() returning DeviceSparseView with operator* for device-resident
  SpMV (d_y = d_A * d_x).
- geam expressions: d_C = d_A + alpha * d_B via cublasXgeam.
- GpuSVD::matrixV() convenience wrapper.

These additions make DeviceMatrix usable as a VectorType in Eigen algorithm
templates. Conjugate gradient is the motivating example and is tested against
CPU ConjugateGradient for correctness.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 20:19:59 -07:00
22 changed files with 3363 additions and 151 deletions

View File

@@ -39,6 +39,7 @@
#ifdef EIGEN_USE_GPU #ifdef EIGEN_USE_GPU
// IWYU pragma: begin_exports // IWYU pragma: begin_exports
#include "src/GPU/DeviceScalar.h"
#include "src/GPU/DeviceMatrix.h" #include "src/GPU/DeviceMatrix.h"
#include "src/GPU/GpuContext.h" #include "src/GPU/GpuContext.h"
#include "src/GPU/DeviceExpr.h" #include "src/GPU/DeviceExpr.h"
@@ -53,8 +54,10 @@
#include "src/GPU/CuFftSupport.h" #include "src/GPU/CuFftSupport.h"
#include "src/GPU/GpuFFT.h" #include "src/GPU/GpuFFT.h"
#include "src/GPU/CuSparseSupport.h" #include "src/GPU/CuSparseSupport.h"
#ifdef EIGEN_SPARSECORE_MODULE_H
#include "src/GPU/GpuSparseContext.h" #include "src/GPU/GpuSparseContext.h"
#ifdef EIGEN_CUDSS #endif
#if defined(EIGEN_CUDSS) && defined(EIGEN_SPARSECORE_MODULE_H)
#include "src/GPU/CuDssSupport.h" #include "src/GPU/CuDssSupport.h"
#include "src/GPU/GpuSparseSolverBase.h" #include "src/GPU/GpuSparseSolverBase.h"
#include "src/GPU/GpuSparseLLT.h" #include "src/GPU/GpuSparseLLT.h"

View File

@@ -21,6 +21,7 @@
#include "./GpuSupport.h" #include "./GpuSupport.h"
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cublasLt.h>
namespace Eigen { namespace Eigen {
namespace internal { namespace internal {
@@ -50,14 +51,14 @@ constexpr cublasOperation_t to_cublas_op(GpuOp op) {
} }
// ---- Scalar → cublasComputeType_t ------------------------------------------- // ---- Scalar → cublasComputeType_t -------------------------------------------
// cublasGemmEx requires a compute type (separate from the data type). // cublasLtMatmul requires a compute type (separate from the data type).
// //
// Precision policy: // Precision policy:
// - Default: tensor core algorithms enabled (CUBLAS_GEMM_DEFAULT_TENSOR_OP). // - Default: tensor core algorithms enabled via cublasLtMatmul heuristics.
// For double, cuBLAS may use Ozaki emulation on sm_80+ tensor cores. // For double, cuBLAS may use Ozaki emulation on sm_80+ tensor cores.
// - EIGEN_CUDA_TF32: opt-in to TF32 for float (~2x faster, 10-bit mantissa). // - EIGEN_CUDA_TF32: opt-in to TF32 for float (~2x faster, 10-bit mantissa).
// - EIGEN_NO_CUDA_TENSOR_OPS: disables all tensor core usage. Uses pedantic // - EIGEN_NO_CUDA_TENSOR_OPS: disables all tensor core usage. Uses pedantic
// compute types and CUBLAS_GEMM_DEFAULT algorithm. For bit-exact reproducibility. // compute types. For bit-exact reproducibility.
template <typename Scalar> template <typename Scalar>
struct cuda_compute_type; struct cuda_compute_type;
@@ -98,8 +99,47 @@ struct cuda_compute_type<std::complex<double>> {
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F; static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
#endif #endif
}; };
// ---- GEMM algorithm hint ---------------------------------------------------- // ---- Alpha/beta scalar type for cublasLtMatmul ------------------------------
// For standard types, alpha/beta match the scalar type.
template <typename Scalar>
struct cuda_gemm_scalar {
using type = Scalar;
};
// ---- cublasLt GEMM dispatch -------------------------------------------------
// Wraps cublasLtMatmul with descriptor setup, heuristic algorithm selection,
// and lazy workspace management. Supports 64-bit dimensions natively.
//
// The workspace buffer (DeviceBuffer*) is grown lazily to match the selected
// algorithm's actual requirement. The heuristic is queried with a generous
// 32 MB cap so that the best algorithm is never excluded. Growth is monotonic:
// the buffer only grows, never shrinks, so reallocation happens at most a few
// times during the lifetime of the owning GpuContext or solver.
//
// EIGEN_NO_CUDA_TENSOR_OPS: pedantic compute types (CUBLAS_COMPUTE_32F_PEDANTIC,
// CUBLAS_COMPUTE_64F_PEDANTIC) prevent cublasLt from selecting tensor core
// algorithms, matching the previous cublasGemmEx behavior.
//
// Thread safety: the workspace buffer is not thread-safe. All GEMM calls
// sharing a workspace must be on the same CUDA stream (guaranteed by GpuContext's
// single-stream design and by each GpuSVD owning its own stream).
//
// Future optimization: for hot loops (e.g., CG iteration), caching descriptors
// and the selected algorithm by (m, n, k, dtype, transA, transB) would avoid
// per-call descriptor creation and heuristic lookup overhead.
#define EIGEN_CUBLASLT_CHECK(expr) \
do { \
cublasStatus_t _s = (expr); \
eigen_assert(_s == CUBLAS_STATUS_SUCCESS && "cuBLASLt call failed"); \
} while (0)
// Maximum workspace the heuristic is allowed to consider. This is a preference
// ceiling, not an allocation — actual allocation matches the selected algorithm.
static constexpr size_t kCublasLtMaxWorkspaceBytes = 32 * 1024 * 1024; // 32 MB
// cublasGemmEx fallback algorithm hint (used when cublasLt heuristic returns no results).
constexpr cublasGemmAlgo_t cuda_gemm_algo() { constexpr cublasGemmAlgo_t cuda_gemm_algo() {
#ifdef EIGEN_NO_CUDA_TENSOR_OPS #ifdef EIGEN_NO_CUDA_TENSOR_OPS
return CUBLAS_GEMM_DEFAULT; return CUBLAS_GEMM_DEFAULT;
@@ -108,13 +148,73 @@ constexpr cublasGemmAlgo_t cuda_gemm_algo() {
#endif #endif
} }
// ---- Alpha/beta scalar type for cublasGemmEx --------------------------------
// For standard types, alpha/beta match the scalar type.
template <typename Scalar> template <typename Scalar>
struct cuda_gemm_scalar { void cublaslt_gemm(cublasLtHandle_t lt_handle, cublasHandle_t cublas_handle, cublasOperation_t transA,
using type = Scalar; cublasOperation_t transB, int64_t m, int64_t n, int64_t k,
}; const typename cuda_gemm_scalar<Scalar>::type* alpha, const Scalar* A, int64_t lda, const Scalar* B,
int64_t ldb, const typename cuda_gemm_scalar<Scalar>::type* beta, Scalar* C, int64_t ldc,
DeviceBuffer* workspace, cudaStream_t stream) {
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
using AlphaType = typename cuda_gemm_scalar<Scalar>::type;
constexpr cudaDataType_t alpha_type = cuda_data_type<AlphaType>::value;
// Matmul descriptor.
cublasLtMatmulDesc_t matmul_desc = nullptr;
EIGEN_CUBLASLT_CHECK(cublasLtMatmulDescCreate(&matmul_desc, compute, alpha_type));
EIGEN_CUBLASLT_CHECK(
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transA, sizeof(transA)));
EIGEN_CUBLASLT_CHECK(
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, sizeof(transB)));
// Matrix layout descriptors (column-major).
// Physical layout dimensions: rows × cols with leading dimension lda/ldb/ldc.
const int64_t a_rows = (transA == CUBLAS_OP_N) ? m : k;
const int64_t b_rows = (transB == CUBLAS_OP_N) ? k : n;
cublasLtMatrixLayout_t layout_A = nullptr, layout_B = nullptr, layout_C = nullptr;
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_A, dtype, a_rows, (transA == CUBLAS_OP_N) ? k : m, lda));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_B, dtype, b_rows, (transB == CUBLAS_OP_N) ? n : k, ldb));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_C, dtype, m, n, ldc));
// Heuristic selection: query with generous workspace cap, allocate only what's needed.
cublasLtMatmulPreference_t preference = nullptr;
EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceCreate(&preference));
size_t max_ws = kCublasLtMaxWorkspaceBytes;
EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
&max_ws, sizeof(max_ws)));
cublasLtMatmulHeuristicResult_t result;
int returned_results = 0;
cublasStatus_t heuristic_status = cublasLtMatmulAlgoGetHeuristic(lt_handle, matmul_desc, layout_A, layout_B, layout_C,
layout_C, preference, 1, &result, &returned_results);
if (heuristic_status == CUBLAS_STATUS_SUCCESS && returned_results > 0) {
// cublasLt path: use the selected algorithm with lazy workspace.
const size_t needed = result.workspaceSize;
if (needed > workspace->size()) {
// Sync only when freeing an existing buffer that may be in use.
if (workspace->ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
*workspace = DeviceBuffer(needed);
}
EIGEN_CUBLASLT_CHECK(cublasLtMatmul(lt_handle, matmul_desc, alpha, A, layout_A, B, layout_B, beta, C, layout_C, C,
layout_C, &result.algo, workspace->ptr, needed, stream));
} else {
// Fallback: cublasGemmEx for shapes/types that cublasLt cannot handle.
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_handle, transA, transB, static_cast<int>(m), static_cast<int>(n),
static_cast<int>(k), alpha, A, dtype, static_cast<int>(lda), B, dtype,
static_cast<int>(ldb), beta, C, dtype, static_cast<int>(ldc), compute,
cuda_gemm_algo()));
}
// Cleanup descriptors.
EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceDestroy(preference));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_C));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_B));
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_A));
EIGEN_CUBLASLT_CHECK(cublasLtMatmulDescDestroy(matmul_desc));
}
// ---- Type-specific cuBLAS wrappers ------------------------------------------ // ---- Type-specific cuBLAS wrappers ------------------------------------------
// cuBLAS uses separate functions per type (Strsm, Dtrsm, etc.). // cuBLAS uses separate functions per type (Strsm, Dtrsm, etc.).
@@ -227,6 +327,100 @@ inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cu
reinterpret_cast<const cuDoubleComplex*>(B), ldb, reinterpret_cast<cuDoubleComplex*>(C), ldc); reinterpret_cast<const cuDoubleComplex*>(B), ldb, reinterpret_cast<cuDoubleComplex*>(C), ldc);
} }
// ---- cuBLAS Level-1 wrappers ------------------------------------------------
// Type-dispatched wrappers for BLAS-1 vector operations: dot, axpy, nrm2, scal, copy.
// These work with CUBLAS_POINTER_MODE_HOST or CUBLAS_POINTER_MODE_DEVICE depending
// on the caller's configuration. For device pointer mode, scalar result pointers
// (dot, nrm2) must point to device memory.
// dot: result = x^T * y (real) or x^H * y (complex conjugate dot)
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const float* x, int incx, const float* y, int incy,
float* result) {
return cublasSdot(h, n, x, incx, y, incy, result);
}
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const double* x, int incx, const double* y, int incy,
double* result) {
return cublasDdot(h, n, x, incx, y, incy, result);
}
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const std::complex<float>* x, int incx,
const std::complex<float>* y, int incy, std::complex<float>* result) {
return cublasCdotc(h, n, reinterpret_cast<const cuComplex*>(x), incx, reinterpret_cast<const cuComplex*>(y), incy,
reinterpret_cast<cuComplex*>(result));
}
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const std::complex<double>* x, int incx,
const std::complex<double>* y, int incy, std::complex<double>* result) {
return cublasZdotc(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx,
reinterpret_cast<const cuDoubleComplex*>(y), incy, reinterpret_cast<cuDoubleComplex*>(result));
}
// nrm2: result = ||x||_2 (always returns real)
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const float* x, int incx, float* result) {
return cublasSnrm2(h, n, x, incx, result);
}
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const double* x, int incx, double* result) {
return cublasDnrm2(h, n, x, incx, result);
}
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const std::complex<float>* x, int incx, float* result) {
return cublasScnrm2(h, n, reinterpret_cast<const cuComplex*>(x), incx, result);
}
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const std::complex<double>* x, int incx, double* result) {
return cublasDznrm2(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx, result);
}
// axpy: y += alpha * x
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const float* alpha, const float* x, int incx, float* y,
int incy) {
return cublasSaxpy(h, n, alpha, x, incx, y, incy);
}
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const double* alpha, const double* x, int incx, double* y,
int incy) {
return cublasDaxpy(h, n, alpha, x, incx, y, incy);
}
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const std::complex<float>* alpha,
const std::complex<float>* x, int incx, std::complex<float>* y, int incy) {
return cublasCaxpy(h, n, reinterpret_cast<const cuComplex*>(alpha), reinterpret_cast<const cuComplex*>(x), incx,
reinterpret_cast<cuComplex*>(y), incy);
}
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const std::complex<double>* alpha,
const std::complex<double>* x, int incx, std::complex<double>* y, int incy) {
return cublasZaxpy(h, n, reinterpret_cast<const cuDoubleComplex*>(alpha), reinterpret_cast<const cuDoubleComplex*>(x),
incx, reinterpret_cast<cuDoubleComplex*>(y), incy);
}
// scal: x *= alpha
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const float* alpha, float* x, int incx) {
return cublasSscal(h, n, alpha, x, incx);
}
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const double* alpha, double* x, int incx) {
return cublasDscal(h, n, alpha, x, incx);
}
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const std::complex<float>* alpha, std::complex<float>* x,
int incx) {
return cublasCscal(h, n, reinterpret_cast<const cuComplex*>(alpha), reinterpret_cast<cuComplex*>(x), incx);
}
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const std::complex<double>* alpha, std::complex<double>* x,
int incx) {
return cublasZscal(h, n, reinterpret_cast<const cuDoubleComplex*>(alpha), reinterpret_cast<cuDoubleComplex*>(x),
incx);
}
// copy: y = x
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const float* x, int incx, float* y, int incy) {
return cublasScopy(h, n, x, incx, y, incy);
}
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const double* x, int incx, double* y, int incy) {
return cublasDcopy(h, n, x, incx, y, incy);
}
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const std::complex<float>* x, int incx,
std::complex<float>* y, int incy) {
return cublasCcopy(h, n, reinterpret_cast<const cuComplex*>(x), incx, reinterpret_cast<cuComplex*>(y), incy);
}
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const std::complex<double>* x, int incx,
std::complex<double>* y, int incy) {
return cublasZcopy(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx, reinterpret_cast<cuDoubleComplex*>(y),
incy);
}
} // namespace internal } // namespace internal
} // namespace Eigen } // namespace Eigen

View File

@@ -29,10 +29,11 @@ namespace Eigen {
namespace internal { namespace internal {
// ---- GEMM dispatch ---------------------------------------------------------- // ---- GEMM dispatch ----------------------------------------------------------
// GemmExpr<Lhs, Rhs> → cublasGemmEx(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) // GemmExpr<Lhs, Rhs> → cublasLtMatmul via GpuContext.
// //
// The generic API cublasGemmEx handles all scalar types (float, double, // Uses cublasLtMatmul for 64-bit dimension support and heuristic algorithm
// complex<float>, complex<double>) via cudaDataType_t. // selection. All scalar types (float, double, complex<float>, complex<double>)
// are handled via cudaDataType_t.
template <typename Lhs, typename Rhs> template <typename Lhs, typename Rhs>
void dispatch_gemm( void dispatch_gemm(
@@ -46,6 +47,10 @@ void dispatch_gemm(
const DeviceMatrix<Scalar>& A = traits_lhs::matrix(expr.lhs()); const DeviceMatrix<Scalar>& A = traits_lhs::matrix(expr.lhs());
const DeviceMatrix<Scalar>& B = traits_rhs::matrix(expr.rhs()); const DeviceMatrix<Scalar>& B = traits_rhs::matrix(expr.rhs());
// cuBLAS GEMM: C must not alias A or B (undefined behavior).
eigen_assert(dst.data() != A.data() && "GEMM: output aliases left operand (use a temporary)");
eigen_assert(dst.data() != B.data() && "GEMM: output aliases right operand (use a temporary)");
constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op); constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op);
constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op); constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op);
@@ -89,13 +94,8 @@ void dispatch_gemm(
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream())); EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
} }
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value; cublaslt_gemm<Scalar>(ctx.cublasLtHandle(), ctx.cublasHandle(), transA, transB, m, n, k, &alpha_gval, A.data(), lda,
constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value; B.data(), ldb, &beta_gval, dst.data(), ldc, ctx.gemmWorkspace(), ctx.stream());
EIGEN_CUBLAS_CHECK(cublasGemmEx(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n),
static_cast<int>(k), &alpha_gval, A.data(), dtype, static_cast<int>(lda), B.data(),
dtype, static_cast<int>(ldb), &beta_gval, dst.data(), dtype, static_cast<int>(ldc),
compute, cuda_gemm_algo()));
dst.recordReady(ctx.stream()); dst.recordReady(ctx.stream());
} }
@@ -504,6 +504,284 @@ void DeviceSelfAdjointView<Scalar_, UpLo_>::rankUpdate(const DeviceMatrix<Scalar
internal::dispatch_syrk(GpuContext::threadLocal(), matrix(), expr, alpha, beta); internal::dispatch_syrk(GpuContext::threadLocal(), matrix(), expr, alpha, beta);
} }
// ---- DeviceMatrix BLAS-1 out-of-line definitions ----------------------------
// Defined here because they need the full GpuContext definition.
// All methods take an explicit GpuContext& so callers can ensure same-stream
// execution (zero event overhead when all operations share one context).
//
// Reduction methods (dot, norm, squaredNorm) use CUBLAS_POINTER_MODE_HOST:
// the scalar result is written to host memory and cuBLAS synchronizes
// internally before returning. This is necessary for Eigen template
// compatibility — CG does `Scalar alpha = absNew / p.dot(tmp)` which
// requires the host value immediately. A future GPU CG implementation
// that controls the iteration loop can use CUBLAS_POINTER_MODE_DEVICE
// to batch multiple reductions into a single sync point.
template <typename Scalar_>
DeviceScalar<typename DeviceMatrix<Scalar_>::Scalar> DeviceMatrix<Scalar_>::dot(GpuContext& ctx,
const DeviceMatrix& other) const {
const int n = static_cast<int>(rows_ * cols_);
eigen_assert(n == static_cast<int>(other.rows_ * other.cols_));
DeviceScalar<Scalar> result(Scalar(0), ctx.stream());
if (n > 0) {
waitReady(ctx.stream());
other.waitReady(ctx.stream());
cublasPointerMode_t prev;
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
EIGEN_CUBLAS_CHECK(internal::cublasXdot(ctx.cublasHandle(), n, data_, 1, other.data_, 1, result.devicePtr()));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
}
return result;
}
namespace internal {
// Real: dot(x,x) returns DeviceScalar<Scalar> which IS DeviceScalar<RealScalar>.
// Move-construct without any sync.
template <typename Scalar, typename RealScalar>
typename std::enable_if<std::is_same<Scalar, RealScalar>::value, DeviceScalar<RealScalar>>::type squaredNorm_from_dot(
DeviceScalar<Scalar>&& d, cudaStream_t) {
return std::move(d);
}
// Complex: must sync to extract the real part (DeviceScalar arithmetic is real-only).
template <typename Scalar, typename RealScalar>
typename std::enable_if<!std::is_same<Scalar, RealScalar>::value, DeviceScalar<RealScalar>>::type squaredNorm_from_dot(
DeviceScalar<Scalar>&& d, cudaStream_t stream) {
return DeviceScalar<RealScalar>(numext::real(Scalar(d)), stream);
}
} // namespace internal
template <typename Scalar_>
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::squaredNorm(GpuContext& ctx) const {
// Use dot(x,x) instead of nrm2()^2: dot kernel is ~4.5x faster than nrm2
// (nrm2 uses a numerically careful scaled-sum-of-squares algorithm that is
// unnecessary for CG convergence checks).
using RealScalar = typename NumTraits<Scalar_>::Real;
return internal::squaredNorm_from_dot<Scalar_, RealScalar>(dot(ctx, *this), ctx.stream());
}
template <typename Scalar_>
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::norm(GpuContext& ctx) const {
using RealScalar = typename NumTraits<Scalar>::Real;
const int n = static_cast<int>(rows_ * cols_);
DeviceScalar<RealScalar> result(RealScalar(0), ctx.stream());
if (n > 0) {
waitReady(ctx.stream());
cublasPointerMode_t prev;
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
EIGEN_CUBLAS_CHECK(internal::cublasXnrm2(ctx.cublasHandle(), n, data_, 1, result.devicePtr()));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
}
return result;
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::setZero(GpuContext& ctx) {
if (sizeInBytes() > 0) {
waitReady(ctx.stream());
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(data_, 0, sizeInBytes(), ctx.stream()));
recordReady(ctx.stream());
}
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::addScaled(GpuContext& ctx, Scalar alpha, const DeviceMatrix& x) {
const int n = static_cast<int>(rows_ * cols_);
eigen_assert(n == static_cast<int>(x.rows_ * x.cols_));
if (n > 0) {
waitReady(ctx.stream());
x.waitReady(ctx.stream());
EIGEN_CUBLAS_CHECK(internal::cublasXaxpy(ctx.cublasHandle(), n, &alpha, x.data_, 1, data_, 1));
recordReady(ctx.stream());
}
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::scale(GpuContext& ctx, Scalar alpha) {
const int n = static_cast<int>(rows_ * cols_);
if (n > 0) {
waitReady(ctx.stream());
EIGEN_CUBLAS_CHECK(internal::cublasXscal(ctx.cublasHandle(), n, &alpha, data_, 1));
recordReady(ctx.stream());
}
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::copyFrom(GpuContext& ctx, const DeviceMatrix& other) {
// Wait on *this before resize — resize may free the old buffer while another
// stream is still reading it.
if (!empty()) waitReady(ctx.stream());
resize(other.rows_, other.cols_);
const int n = static_cast<int>(rows_ * cols_);
if (n > 0) {
other.waitReady(ctx.stream());
EIGEN_CUBLAS_CHECK(internal::cublasXcopy(ctx.cublasHandle(), n, other.data_, 1, data_, 1));
recordReady(ctx.stream());
}
}
// ---- BLAS-1 operator overloads for CG compatibility -------------------------
// this += alpha * x (axpy)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceScaled<DeviceMatrix>& expr) {
addScaled(GpuContext::threadLocal(), expr.scalar(), internal::device_expr_traits<DeviceMatrix>::matrix(expr.inner()));
return *this;
}
// this -= alpha * x (axpy with negated alpha)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceScaled<DeviceMatrix>& expr) {
addScaled(GpuContext::threadLocal(), -expr.scalar(),
internal::device_expr_traits<DeviceMatrix>::matrix(expr.inner()));
return *this;
}
// this += x (axpy with alpha=1)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceMatrix& other) {
Scalar one(1);
addScaled(GpuContext::threadLocal(), one, other);
return *this;
}
// this -= x (axpy with alpha=-1)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceMatrix& other) {
Scalar neg_one(-1);
addScaled(GpuContext::threadLocal(), neg_one, other);
return *this;
}
// this *= alpha (scal, host pointer)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator*=(Scalar alpha) {
scale(GpuContext::threadLocal(), alpha);
return *this;
}
// this *= alpha (scal, device pointer — avoids host sync)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator*=(const DeviceScalar<Scalar>& alpha) {
const int n = static_cast<int>(rows_ * cols_);
if (n > 0) {
auto& ctx = GpuContext::threadLocal();
waitReady(ctx.stream());
cublasPointerMode_t prev;
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
EIGEN_CUBLAS_CHECK(internal::cublasXscal(ctx.cublasHandle(), n, alpha.devicePtr(), data_, 1));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
recordReady(ctx.stream());
}
return *this;
}
// this += DeviceScalar * x (axpy with CUBLAS_POINTER_MODE_DEVICE)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceScaledDevice<Scalar_>& expr) {
const int n = static_cast<int>(rows_ * cols_);
const auto& x = expr.matrix();
eigen_assert(n == static_cast<int>(x.rows_ * x.cols_));
if (n > 0) {
auto& ctx = GpuContext::threadLocal();
waitReady(ctx.stream());
x.waitReady(ctx.stream());
cublasPointerMode_t prev;
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
EIGEN_CUBLAS_CHECK(internal::cublasXaxpy(ctx.cublasHandle(), n, expr.alpha().devicePtr(), x.data_, 1, data_, 1));
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
recordReady(ctx.stream());
}
return *this;
}
// this -= DeviceScalar * x (axpy with negated device scalar)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceScaledDevice<Scalar_>& expr) {
auto neg_alpha = -expr.alpha();
DeviceScaledDevice<Scalar_> neg_expr(neg_alpha, expr.matrix());
return operator+=(neg_expr);
}
// this = alpha * A + beta * B (cuBLAS geam)
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const DeviceAddExpr<Scalar_>& expr) {
auto& ctx = GpuContext::threadLocal();
const auto& A = expr.A();
const auto& B = expr.B();
eigen_assert(A.rows() == B.rows() && A.cols() == B.cols());
const int m = static_cast<int>(A.rows());
const int n = static_cast<int>(A.cols());
// Wait on *this before resize — resize may free the old buffer while another
// stream is still reading it.
if (!empty()) waitReady(ctx.stream());
resize(A.rows(), A.cols());
if (m > 0 && n > 0) {
A.waitReady(ctx.stream());
B.waitReady(ctx.stream());
Scalar_ alpha = expr.alpha();
Scalar_ beta = expr.beta();
EIGEN_CUBLAS_CHECK(internal::cublasXgeam(ctx.cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, &alpha, A.data(), m,
&beta, B.data(), m, data_, m));
recordReady(ctx.stream());
}
return *this;
}
// cwiseProduct via NPP nppsMul (allocating).
template <typename Scalar_>
DeviceMatrix<Scalar_> DeviceMatrix<Scalar_>::cwiseProduct(GpuContext& ctx, const DeviceMatrix& other) const {
const int n = static_cast<int>(rows_ * cols_);
eigen_assert(n == static_cast<int>(other.rows_ * other.cols_));
DeviceMatrix result(rows_, cols_);
if (n > 0) {
waitReady(ctx.stream());
other.waitReady(ctx.stream());
internal::device_cwiseProduct(data_, other.data_, result.data_, n, ctx.stream());
result.recordReady(ctx.stream());
}
return result;
}
// In-place cwiseProduct: this = a .* b (reuses this buffer, no allocation).
template <typename Scalar_>
void DeviceMatrix<Scalar_>::cwiseProduct(GpuContext& ctx, const DeviceMatrix& a, const DeviceMatrix& b) {
const int n = static_cast<int>(a.rows_ * a.cols_);
eigen_assert(n == static_cast<int>(b.rows_ * b.cols_));
if (!empty()) waitReady(ctx.stream());
resize(a.rows_, a.cols_);
if (n > 0) {
a.waitReady(ctx.stream());
b.waitReady(ctx.stream());
internal::device_cwiseProduct(a.data_, b.data_, data_, n, ctx.stream());
recordReady(ctx.stream());
}
}
// Convenience overloads using thread-local default GpuContext.
template <typename Scalar_>
DeviceScalar<typename DeviceMatrix<Scalar_>::Scalar> DeviceMatrix<Scalar_>::dot(const DeviceMatrix& other) const {
return dot(GpuContext::threadLocal(), other);
}
template <typename Scalar_>
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::squaredNorm() const {
return squaredNorm(GpuContext::threadLocal());
}
template <typename Scalar_>
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::norm() const {
return norm(GpuContext::threadLocal());
}
template <typename Scalar_>
void DeviceMatrix<Scalar_>::setZero() {
setZero(GpuContext::threadLocal());
}
} // namespace Eigen } // namespace Eigen
#endif // EIGEN_GPU_DEVICE_DISPATCH_H #endif // EIGEN_GPU_DEVICE_DISPATCH_H

View File

@@ -219,6 +219,87 @@ DeviceScaled<DeviceTransposeView<S>> operator*(S alpha, const DeviceTransposeVie
return {alpha, m}; return {alpha, m};
} }
// ---- DeviceScaledDevice: DeviceScalar * DeviceMatrix → device-pointer axpy ---
// Like DeviceScaled but carries a DeviceScalar (device pointer) instead of
// a host scalar. operator+= dispatches to cuBLAS axpy with POINTER_MODE_DEVICE.
template <typename Scalar_>
class DeviceScaledDevice {
public:
using Scalar = Scalar_;
DeviceScaledDevice(const DeviceScalar<Scalar>& alpha, const DeviceMatrix<Scalar>& mat) : alpha_(alpha), mat_(mat) {}
const DeviceScalar<Scalar>& alpha() const { return alpha_; }
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
private:
const DeviceScalar<Scalar>& alpha_;
const DeviceMatrix<Scalar>& mat_;
};
// DeviceScalar * DeviceMatrix → DeviceScaledDevice
template <typename S>
DeviceScaledDevice<S> operator*(const DeviceScalar<S>& alpha, const DeviceMatrix<S>& m) {
return {alpha, m};
}
// ---- DeviceAddExpr: a + b → cublasXgeam -------------------------------------
// Captures `DeviceMatrix + DeviceScaled<DeviceMatrix>` (and reverse).
// Dispatched to geam: C = alpha * A + beta * B.
//
// Note: These operator+/- overloads are intentionally free functions on
// DeviceMatrix, not Eigen expression templates. DeviceMatrix does not inherit
// from MatrixBase, so there is no ambiguity with Eigen's own operator+/-.
// If DeviceMatrix is ever made an Eigen expression type, these would need to
// be revisited.
template <typename Scalar_>
class DeviceAddExpr {
public:
using Scalar = Scalar_;
DeviceAddExpr(Scalar alpha, const DeviceMatrix<Scalar>& A, Scalar beta, const DeviceMatrix<Scalar>& B)
: alpha_(alpha), A_(A), beta_(beta), B_(B) {}
Scalar alpha() const { return alpha_; }
Scalar beta() const { return beta_; }
const DeviceMatrix<Scalar>& A() const { return A_; }
const DeviceMatrix<Scalar>& B() const { return B_; }
private:
Scalar alpha_;
const DeviceMatrix<Scalar>& A_;
Scalar beta_;
const DeviceMatrix<Scalar>& B_;
};
// DeviceMatrix + DeviceMatrix → DeviceAddExpr (alpha=1, beta=1)
template <typename S>
DeviceAddExpr<S> operator+(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
return {S(1), a, S(1), b};
}
// DeviceMatrix + DeviceScaled<DeviceMatrix> → DeviceAddExpr (alpha=1, beta=scaled)
template <typename S>
DeviceAddExpr<S> operator+(const DeviceMatrix<S>& a, const DeviceScaled<DeviceMatrix<S>>& b) {
return {S(1), a, b.scalar(), b.inner()};
}
// DeviceScaled<DeviceMatrix> + DeviceMatrix → DeviceAddExpr (alpha=scaled, beta=1)
template <typename S>
DeviceAddExpr<S> operator+(const DeviceScaled<DeviceMatrix<S>>& a, const DeviceMatrix<S>& b) {
return {a.scalar(), a.inner(), S(1), b};
}
// DeviceMatrix - DeviceMatrix → DeviceAddExpr (alpha=1, beta=-1)
template <typename S>
DeviceAddExpr<S> operator-(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
return {S(1), a, S(-1), b};
}
// DeviceMatrix - DeviceScaled<DeviceMatrix> → DeviceAddExpr (alpha=1, beta=-scaled)
template <typename S>
DeviceAddExpr<S> operator-(const DeviceMatrix<S>& a, const DeviceScaled<DeviceMatrix<S>>& b) {
return {S(1), a, -b.scalar(), b.inner()};
}
} // namespace Eigen } // namespace Eigen
#endif // EIGEN_GPU_DEVICE_EXPR_H #endif // EIGEN_GPU_DEVICE_EXPR_H

View File

@@ -53,6 +53,16 @@ template <typename>
class DeviceAssignment; class DeviceAssignment;
template <typename, typename> template <typename, typename>
class GemmExpr; class GemmExpr;
template <typename>
class DeviceScaled;
template <typename>
class SpMVExpr;
template <typename>
class DeviceAddExpr;
template <typename>
class DeviceScaledDevice;
template <typename>
class DeviceScalar;
template <typename, int> template <typename, int>
class LltSolveExpr; class LltSolveExpr;
template <typename> template <typename>
@@ -170,6 +180,8 @@ template <typename Scalar_>
class DeviceMatrix { class DeviceMatrix {
public: public:
using Scalar = Scalar_; using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainObject = DeviceMatrix; // owning type (for CG template compatibility)
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>; using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
// ---- Construction / destruction ------------------------------------------ // ---- Construction / destruction ------------------------------------------
@@ -177,6 +189,16 @@ class DeviceMatrix {
/** Default: empty (0x0, no allocation). */ /** Default: empty (0x0, no allocation). */
DeviceMatrix() = default; DeviceMatrix() = default;
/** Allocate uninitialized column vector of given size.
* Matches Matrix<Scalar,Dynamic,1>(n) for CG template compatibility. */
explicit DeviceMatrix(Index n) : rows_(n), cols_(1) {
eigen_assert(n >= 0);
size_t bytes = sizeInBytes();
if (bytes > 0) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
}
}
/** Allocate uninitialized device memory for a rows x cols matrix. */ /** Allocate uninitialized device memory for a rows x cols matrix. */
DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols) { DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols) {
eigen_assert(rows >= 0 && cols >= 0); eigen_assert(rows >= 0 && cols >= 0);
@@ -446,6 +468,104 @@ class DeviceMatrix {
template <int UpLo> template <int UpLo>
DeviceMatrix& operator=(const SymmExpr<Scalar, UpLo>& expr); DeviceMatrix& operator=(const SymmExpr<Scalar, UpLo>& expr);
// ---- BLAS Level-1 operations ----------------------------------------------
// DeviceMatrix is always dense (lda == rows), so a vector is simply a
// DeviceMatrix with cols == 1. These BLAS-1 methods operate on the flat
// rows*cols element array, making them work for both vectors and matrices.
//
// All methods take an explicit GpuContext& for stream/handle control.
// When everything uses the same context, event waits are skipped (same-stream).
// Defined out-of-line in DeviceDispatch.h (needs GpuContext).
/** Dot product: this^H * other. Returns DeviceScalar — the result stays
* on device until read via implicit conversion to Scalar (which syncs).
* When used with `auto`, no sync occurs until the value is needed. */
DeviceScalar<Scalar> dot(GpuContext& ctx, const DeviceMatrix& other) const;
/** Squared L2 norm via dot(x, x). Returns DeviceScalar (no sync until read).
* For real types, the result stays on device. For complex types, falls back
* to host sync (DeviceScalar arithmetic is real-only). */
DeviceScalar<typename NumTraits<Scalar>::Real> squaredNorm(GpuContext& ctx) const;
/** L2 norm. Returns DeviceScalar (no host sync). */
DeviceScalar<typename NumTraits<Scalar>::Real> norm(GpuContext& ctx) const;
/** Set all elements to zero. */
void setZero(GpuContext& ctx);
/** this += alpha * x (cuBLAS axpy). Requires same total size. */
void addScaled(GpuContext& ctx, Scalar alpha, const DeviceMatrix& x);
/** this *= alpha (cuBLAS scal). */
void scale(GpuContext& ctx, Scalar alpha);
/** Deep copy: this = other (cuBLAS copy). Resizes if needed. */
void copyFrom(GpuContext& ctx, const DeviceMatrix& other);
// Convenience overloads using the thread-local default GpuContext.
DeviceScalar<Scalar> dot(const DeviceMatrix& other) const;
DeviceScalar<typename NumTraits<Scalar>::Real> squaredNorm() const;
DeviceScalar<typename NumTraits<Scalar>::Real> norm() const;
void setZero();
// ---- BLAS-1 operator overloads for CG/iterative solver compatibility ------
// These allow CG code like `x += alpha * p` to work with DeviceMatrix.
// `alpha * DeviceMatrix` already returns `DeviceScaled<DeviceMatrix<Scalar>>`
// (defined in DeviceExpr.h). These operators dispatch to cuBLAS axpy/scal.
// Defined out-of-line in DeviceDispatch.h.
/** this += alpha * x (cuBLAS axpy). For `x += alpha * p`. */
DeviceMatrix& operator+=(const DeviceScaled<DeviceMatrix>& expr);
/** this -= alpha * x (cuBLAS axpy with negated alpha). For `r -= alpha * tmp`. */
DeviceMatrix& operator-=(const DeviceScaled<DeviceMatrix>& expr);
/** this += x (cuBLAS axpy with alpha=1). */
DeviceMatrix& operator+=(const DeviceMatrix& other);
/** this -= x (cuBLAS axpy with alpha=-1). */
DeviceMatrix& operator-=(const DeviceMatrix& other);
/** this *= alpha (cuBLAS scal, host pointer mode). For `p *= beta`. */
DeviceMatrix& operator*=(Scalar alpha);
/** this *= alpha (cuBLAS scal, device pointer mode). Avoids host sync. */
DeviceMatrix& operator*=(const DeviceScalar<Scalar>& alpha);
/** Element-wise product: result[i] = this[i] * other[i] (NPP nppsMul).
* Returns a new DeviceMatrix. Defined out-of-line in DeviceDispatch.h. */
DeviceMatrix cwiseProduct(GpuContext& ctx, const DeviceMatrix& other) const;
/** In-place element-wise product: this[i] = a[i] * b[i] (NPP nppsMul).
* Reuses this matrix's buffer when sizes match, avoiding cudaMalloc. */
void cwiseProduct(GpuContext& ctx, const DeviceMatrix& a, const DeviceMatrix& b);
/** this += DeviceScalar * x (cuBLAS axpy with POINTER_MODE_DEVICE). */
DeviceMatrix& operator+=(const DeviceScaledDevice<Scalar>& expr);
/** this -= DeviceScalar * x (cuBLAS axpy with negated device scalar). */
DeviceMatrix& operator-=(const DeviceScaledDevice<Scalar>& expr);
/** Assign from an SpMV expression: d_y = d_A * d_x. */
DeviceMatrix& operator=(const SpMVExpr<Scalar>& expr);
/** Assign from an add expression: d_C = alpha * d_A + beta * d_B (cuBLAS geam). */
DeviceMatrix& operator=(const DeviceAddExpr<Scalar>& expr);
/** No-op — all DeviceMatrix operations are implicitly noalias.
*
* Unlike Eigen's Matrix, where omitting .noalias() triggers a copy to a
* temporary for safety, DeviceMatrix dispatches directly to NVIDIA library
* calls which have no built-in aliasing protection. Every assignment
* (`d_C = d_A * d_B`, `d_y = d_A * d_x`, etc.) behaves as if .noalias()
* were specified. The caller must ensure operands don't alias the
* destination for GEMM and SpMV. geam (`d_C = d_A + alpha * d_B`) is
* safe with aliasing. Debug asserts catch violations.
*
* This method exists so that `tmp.noalias() = mat * p` compiles for both
* Matrix and DeviceMatrix. */
DeviceMatrix& noalias() { return *this; }
private: private:
// ---- Private: adopt a raw device pointer (used by friend solvers) -------- // ---- Private: adopt a raw device pointer (used by friend solvers) --------

View File

@@ -0,0 +1,121 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Device-resident scalar for deferred host synchronization.
//
// DeviceScalar<Scalar> wraps a single value in device memory. Reductions
// (dot, nrm2) write results directly to device memory via
// CUBLAS_POINTER_MODE_DEVICE, deferring host sync until the value is read.
//
// Implicit conversion to Scalar triggers cudaStreamSynchronize + download.
// In CG, this reduces 3 syncs/iter to effectively 1: the first conversion
// syncs the stream, subsequent conversions in the same expression just
// download (the stream is already flushed).
//
// Usage:
// auto dot_val = d_x.dot(d_y); // DeviceScalar, no sync
// auto norm_val = d_r.squaredNorm(); // DeviceScalar, no sync
// Scalar alpha = absNew / dot_val; // sync here (both values downloaded)
// d_x += alpha * d_p; // host-scalar axpy (as before)
#ifndef EIGEN_GPU_DEVICE_SCALAR_H
#define EIGEN_GPU_DEVICE_SCALAR_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSupport.h"
#include "./DeviceScalarOps.h"
namespace Eigen {
template <typename Scalar_>
class DeviceScalar {
public:
using Scalar = Scalar_;
/** Allocate uninitialized device scalar. Contents are undefined until written
* (e.g., by cuBLAS dot/nrm2 with POINTER_MODE_DEVICE). Consistent with
* DeviceMatrix(rows, cols) which also does not zero-initialize. */
explicit DeviceScalar(cudaStream_t stream = nullptr) : d_val_(sizeof(Scalar)), stream_(stream) {}
DeviceScalar(Scalar host_val, cudaStream_t stream) : d_val_(sizeof(Scalar)), stream_(stream) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_val_.ptr, &host_val, sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
}
DeviceScalar(DeviceScalar&& o) noexcept : d_val_(std::move(o.d_val_)), stream_(o.stream_) { o.stream_ = nullptr; }
DeviceScalar& operator=(DeviceScalar&& o) noexcept {
if (this != &o) {
d_val_ = std::move(o.d_val_);
stream_ = o.stream_;
o.stream_ = nullptr;
}
return *this;
}
DeviceScalar(const DeviceScalar&) = delete;
DeviceScalar& operator=(const DeviceScalar&) = delete;
/** Download from device. Synchronizes the stream on first call;
* subsequent calls in the same expression are cheap (stream already flushed). */
Scalar get() const {
Scalar result;
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&result, d_val_.ptr, sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
return result;
}
/** Implicit conversion — allows `Scalar alpha = deviceScalar` and
* `if (deviceScalar < threshold)`. Triggers sync. */
operator Scalar() const { return get(); }
Scalar* devicePtr() { return static_cast<Scalar*>(d_val_.ptr); }
const Scalar* devicePtr() const { return static_cast<const Scalar*>(d_val_.ptr); }
cudaStream_t stream() const { return stream_; }
// ---- Device-side arithmetic (no host sync) ---------------------------------
// Uses NPP from DeviceScalarOps.h. All results stay on device.
// Currently supports real types only (float, double). Complex types
// fall back to implicit conversion (host sync) for division.
//
// Note: DeviceScalar has no cross-stream readiness tracking. All
// operations must be on the same CUDA stream. This is the natural
// pattern in iterative solvers where one GpuContext owns all work.
friend DeviceScalar operator/(const DeviceScalar& a, const DeviceScalar& b) {
DeviceScalar result(a.stream_);
internal::device_scalar_div(a.devicePtr(), b.devicePtr(), result.devicePtr(), a.stream_);
return result;
}
friend DeviceScalar operator/(Scalar a, const DeviceScalar& b) {
DeviceScalar d_a(a, b.stream_);
return d_a / b;
}
friend DeviceScalar operator/(const DeviceScalar& a, Scalar b) {
DeviceScalar d_b(b, a.stream_);
return a / d_b;
}
DeviceScalar operator-() const {
DeviceScalar result(stream_);
internal::device_scalar_neg(devicePtr(), result.devicePtr(), stream_);
return result;
}
private:
internal::DeviceBuffer d_val_;
cudaStream_t stream_ = nullptr;
};
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_SCALAR_H

View File

@@ -0,0 +1,117 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Device-resident scalar and element-wise operations via NPP signals.
// Header-only — no custom CUDA kernels needed. Uses nppsDiv, nppsMul,
// nppsMulC from the NPP library (CUDA::npps, part of the CUDA toolkit).
#ifndef EIGEN_GPU_DEVICE_SCALAR_OPS_H
#define EIGEN_GPU_DEVICE_SCALAR_OPS_H
#include <cuda_runtime.h>
#include <npps_arithmetic_and_logical_operations.h>
namespace Eigen {
namespace internal {
// ---- NppStreamContext helper ------------------------------------------------
inline NppStreamContext make_npp_stream_ctx(cudaStream_t stream) {
// Cache device attributes (constant for process lifetime) in a thread-local.
// Only the stream and its flags vary per call.
struct CachedDeviceInfo {
bool initialized = false;
int device_id = 0;
int cc_major = 0;
int cc_minor = 0;
int mp_count = 0;
int max_threads_per_mp = 0;
int max_threads_per_block = 0;
int shared_mem_per_block = 0;
void init() {
if (initialized) return;
cudaGetDevice(&device_id);
cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device_id);
cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device_id);
cudaDeviceGetAttribute(&mp_count, cudaDevAttrMultiProcessorCount, device_id);
cudaDeviceGetAttribute(&max_threads_per_mp, cudaDevAttrMaxThreadsPerMultiProcessor, device_id);
cudaDeviceGetAttribute(&max_threads_per_block, cudaDevAttrMaxThreadsPerBlock, device_id);
cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id);
initialized = true;
}
};
thread_local CachedDeviceInfo cached;
cached.init();
NppStreamContext ctx = {};
ctx.hStream = stream;
ctx.nCudaDeviceId = cached.device_id;
ctx.nCudaDevAttrComputeCapabilityMajor = cached.cc_major;
ctx.nCudaDevAttrComputeCapabilityMinor = cached.cc_minor;
ctx.nMultiProcessorCount = cached.mp_count;
ctx.nMaxThreadsPerMultiProcessor = cached.max_threads_per_mp;
ctx.nMaxThreadsPerBlock = cached.max_threads_per_block;
ctx.nSharedMemPerBlock = cached.shared_mem_per_block;
cudaStreamGetFlags(stream, &ctx.nStreamFlags);
return ctx;
}
// ---- Scalar division: c = a / b (device-resident, async) --------------------
inline void device_scalar_div(const float* a, const float* b, float* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_32f_Ctx(b, a, c, 1, npp_ctx); // NPP: pDst[i] = pSrc2[i] / pSrc1[i]
}
inline void device_scalar_div(const double* a, const double* b, double* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_64f_Ctx(b, a, c, 1, npp_ctx); // NPP: pDst[i] = pSrc2[i] / pSrc1[i]
}
// ---- Scalar negation: c = -a (device-resident, async) -----------------------
inline void device_scalar_neg(const float* a, float* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMulC_32f_Ctx(a, -1.0f, c, 1, npp_ctx);
}
inline void device_scalar_neg(const double* a, double* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMulC_64f_Ctx(a, -1.0, c, 1, npp_ctx);
}
// ---- Element-wise vector multiply: c[i] = a[i] * b[i] ----------------------
inline void device_cwiseProduct(const float* a, const float* b, float* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMul_32f_Ctx(a, b, c, static_cast<size_t>(n), npp_ctx);
}
inline void device_cwiseProduct(const double* a, const double* b, double* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMul_64f_Ctx(a, b, c, static_cast<size_t>(n), npp_ctx);
}
// ---- Element-wise vector division: c[i] = a[i] / b[i] ----------------------
inline void device_cwiseQuotient(const float* a, const float* b, float* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_32f_Ctx(b, a, c, static_cast<size_t>(n), npp_ctx); // NPP: dst = src2 / src1
}
inline void device_cwiseQuotient(const double* a, const double* b, double* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_64f_Ctx(b, a, c, static_cast<size_t>(n), npp_ctx);
}
} // namespace internal
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_SCALAR_OPS_H

View File

@@ -28,6 +28,8 @@
#include "./CuBlasSupport.h" #include "./CuBlasSupport.h"
#include "./CuSolverSupport.h" #include "./CuSolverSupport.h"
#include <cusparse.h>
#include <cufft.h>
namespace Eigen { namespace Eigen {
@@ -44,38 +46,92 @@ namespace Eigen {
*/ */
class GpuContext { class GpuContext {
public: public:
GpuContext() { /** Create a new context with a dedicated CUDA stream. */
GpuContext() : owns_stream_(true) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_)); EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_)); init_handles();
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
} }
/** Create a context on an existing stream (e.g., stream 0 = nullptr).
* The caller retains ownership of the stream — this context will not destroy it. */
explicit GpuContext(cudaStream_t stream) : stream_(stream), owns_stream_(false) { init_handles(); }
~GpuContext() { ~GpuContext() {
if (cusparse_) (void)cusparseDestroy(cusparse_);
if (cusolver_) (void)cusolverDnDestroy(cusolver_); if (cusolver_) (void)cusolverDnDestroy(cusolver_);
if (cublas_lt_) (void)cublasLtDestroy(cublas_lt_);
if (cublas_) (void)cublasDestroy(cublas_); if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_); if (owns_stream_ && stream_) (void)cudaStreamDestroy(stream_);
} }
// Non-copyable, non-movable (owns library handles). // Non-copyable, non-movable (owns library handles).
GpuContext(const GpuContext&) = delete; GpuContext(const GpuContext&) = delete;
GpuContext& operator=(const GpuContext&) = delete; GpuContext& operator=(const GpuContext&) = delete;
/** Lazily-created thread-local default context. */ /** Get the thread-local default context.
* If setThreadLocal() has been called, returns that context.
* Otherwise lazily creates a new context with a dedicated stream. */
static GpuContext& threadLocal() { static GpuContext& threadLocal() {
GpuContext* override = tl_override_ptr();
if (override) return *override;
thread_local GpuContext ctx; thread_local GpuContext ctx;
return ctx; return ctx;
} }
/** Override the thread-local default context for this thread.
* The caller retains ownership of \p ctx — it must outlive all uses.
* Pass nullptr to restore the lazily-created default. */
static void setThreadLocal(GpuContext* ctx) { tl_override_ptr() = ctx; }
cudaStream_t stream() const { return stream_; } cudaStream_t stream() const { return stream_; }
cublasHandle_t cublasHandle() const { return cublas_; } cublasHandle_t cublasHandle() const { return cublas_; }
cusolverDnHandle_t cusolverHandle() const { return cusolver_; } cusolverDnHandle_t cusolverHandle() const { return cusolver_; }
/** cuBLASLt handle (lazy-initialized on first GEMM call). */
cublasLtHandle_t cublasLtHandle() const {
if (!cublas_lt_) {
EIGEN_CUBLAS_CHECK(cublasLtCreate(&cublas_lt_));
}
return cublas_lt_;
}
/** Workspace buffer for cublasLtMatmul (grown lazily by cublaslt_gemm).
* Not thread-safe — all GEMM calls must be on this context's stream. */
internal::DeviceBuffer* gemmWorkspace() const { return &gemm_workspace_; }
/** cuSPARSE handle (lazy-initialized on first call). */
cusparseHandle_t cusparseHandle() const {
if (!cusparse_) {
cusparseStatus_t s1 = cusparseCreate(&cusparse_);
eigen_assert(s1 == CUSPARSE_STATUS_SUCCESS && "cusparseCreate failed");
EIGEN_UNUSED_VARIABLE(s1);
cusparseStatus_t s2 = cusparseSetStream(cusparse_, stream_);
eigen_assert(s2 == CUSPARSE_STATUS_SUCCESS && "cusparseSetStream failed");
EIGEN_UNUSED_VARIABLE(s2);
}
return cusparse_;
}
private: private:
cudaStream_t stream_ = nullptr; cudaStream_t stream_ = nullptr;
cublasHandle_t cublas_ = nullptr; cublasHandle_t cublas_ = nullptr;
cusolverDnHandle_t cusolver_ = nullptr; cusolverDnHandle_t cusolver_ = nullptr;
mutable cublasLtHandle_t cublas_lt_ = nullptr; // lazy
mutable cusparseHandle_t cusparse_ = nullptr; // lazy
mutable internal::DeviceBuffer gemm_workspace_; // lazy
bool owns_stream_ = true;
static GpuContext*& tl_override_ptr() {
thread_local GpuContext* ptr = nullptr;
return ptr;
}
void init_handles() {
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
}
}; };
} // namespace Eigen } // namespace Eigen

View File

@@ -18,6 +18,7 @@
// GpuSVD<double> svd(A, ComputeThinU | ComputeThinV); // GpuSVD<double> svd(A, ComputeThinU | ComputeThinV);
// VectorXd S = svd.singularValues(); // VectorXd S = svd.singularValues();
// MatrixXd U = svd.matrixU(); // m×k or m×m // MatrixXd U = svd.matrixU(); // m×k or m×m
// MatrixXd V = svd.matrixV(); // n×k or n×n (matches JacobiSVD)
// MatrixXd VT = svd.matrixVT(); // k×n or n×n (this is V^T) // MatrixXd VT = svd.matrixVT(); // k×n or n×n (this is V^T)
// MatrixXd X = svd.solve(B); // pseudoinverse // MatrixXd X = svd.solve(B); // pseudoinverse
// MatrixXd X = svd.solve(B, k); // truncated (top k triplets) // MatrixXd X = svd.solve(B, k); // truncated (top k triplets)
@@ -53,6 +54,7 @@ class GpuSVD {
~GpuSVD() { ~GpuSVD() {
if (handle_) (void)cusolverDnDestroy(handle_); if (handle_) (void)cusolverDnDestroy(handle_);
if (cublas_lt_) (void)cublasLtDestroy(cublas_lt_);
if (cublas_) (void)cublasDestroy(cublas_); if (cublas_) (void)cublasDestroy(cublas_);
if (stream_) (void)cudaStreamDestroy(stream_); if (stream_) (void)cudaStreamDestroy(stream_);
} }
@@ -185,6 +187,10 @@ class GpuSVD {
} }
} }
/** Right singular vectors V. Returns n_orig × k or n_orig × n_orig.
* Equivalent to matrixVT().adjoint(). Matches Eigen's JacobiSVD::matrixV() API. */
PlainMatrix matrixV() const { return matrixVT().adjoint(); }
/** Right singular vectors transposed V^T. Returns k × n_orig or n_orig × n_orig. /** Right singular vectors transposed V^T. Returns k × n_orig or n_orig × n_orig.
* For transposed case, VT comes from cuSOLVER's U. */ * For transposed case, VT comes from cuSOLVER's U. */
PlainMatrix matrixVT() const { PlainMatrix matrixVT() const {
@@ -254,6 +260,8 @@ class GpuSVD {
cudaStream_t stream_ = nullptr; cudaStream_t stream_ = nullptr;
cusolverDnHandle_t handle_ = nullptr; cusolverDnHandle_t handle_ = nullptr;
cublasHandle_t cublas_ = nullptr; cublasHandle_t cublas_ = nullptr;
cublasLtHandle_t cublas_lt_ = nullptr;
mutable internal::DeviceBuffer gemm_workspace_;
internal::CusolverParams params_; internal::CusolverParams params_;
internal::DeviceBuffer d_A_; // working copy of A (overwritten by gesvd) internal::DeviceBuffer d_A_; // working copy of A (overwritten by gesvd)
internal::DeviceBuffer d_U_; // left singular vectors internal::DeviceBuffer d_U_; // left singular vectors
@@ -277,6 +285,7 @@ class GpuSVD {
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_)); EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_)); EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_)); EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
EIGEN_CUBLASLT_CHECK(cublasLtCreate(&cublas_lt_));
ensure_scratch(0); ensure_scratch(0);
} }
@@ -410,23 +419,21 @@ class GpuSVD {
internal::DeviceBuffer d_tmp(static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar)); internal::DeviceBuffer d_tmp(static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar));
{ {
Scalar alpha_one(1), beta_zero(0); Scalar alpha_one(1), beta_zero(0);
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;
if (!transposed_) { if (!transposed_) {
// U_stored^H * B: (m_×kk)^H × (m_×nrhs) → kk×nrhs. // U_stored^H * B: (m_×kk)^H × (m_×nrhs) → kk×nrhs.
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs), internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_C, CUBLAS_OP_N, kk, nrhs, m_, &alpha_one,
static_cast<int>(m_), &alpha_one, d_U_.ptr, dtype, static_cast<int>(m_), static_cast<const Scalar*>(d_U_.ptr), m_, static_cast<const Scalar*>(d_B.ptr),
d_B.ptr, dtype, static_cast<int>(m_orig), &beta_zero, d_tmp.ptr, dtype, m_orig, &beta_zero, static_cast<Scalar*>(d_tmp.ptr), kk, &gemm_workspace_,
static_cast<int>(kk), compute, internal::cuda_gemm_algo())); stream_);
} else { } else {
// VT_stored * B: VT_stored is vtrows×n_ = kk×m_orig (thin), NoTrans. // VT_stored * B: VT_stored is vtrows×n_ = kk×m_orig (thin), NoTrans.
// vtrows×m_orig times m_orig×nrhs → vtrows×nrhs. Use first kk rows. // vtrows×m_orig times m_orig×nrhs → vtrows×nrhs. Use first kk rows.
const Index vtrows_stored = (swap_uv_options(options_) & ComputeFullV) ? n_ : k; const Index vtrows_stored = (swap_uv_options(options_) & ComputeFullV) ? n_ : k;
EIGEN_CUBLAS_CHECK(cublasGemmEx( internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_N, CUBLAS_OP_N, kk, nrhs, m_orig, &alpha_one,
cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs), static_cast<int>(m_orig), static_cast<const Scalar*>(d_VT_.ptr), vtrows_stored,
&alpha_one, d_VT_.ptr, dtype, static_cast<int>(vtrows_stored), d_B.ptr, dtype, static_cast<int>(m_orig), static_cast<const Scalar*>(d_B.ptr), m_orig, &beta_zero,
&beta_zero, d_tmp.ptr, dtype, static_cast<int>(kk), compute, internal::cuda_gemm_algo())); static_cast<Scalar*>(d_tmp.ptr), kk, &gemm_workspace_, stream_);
} }
} }
@@ -458,21 +465,19 @@ class GpuSVD {
{ {
internal::DeviceBuffer d_X(static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar)); internal::DeviceBuffer d_X(static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
Scalar alpha_one(1), beta_zero(0); Scalar alpha_one(1), beta_zero(0);
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;
if (!transposed_) { if (!transposed_) {
const Index vtrows = (options_ & ComputeFullV) ? n_ : k; const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(n_orig), internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_C, CUBLAS_OP_N, n_orig, nrhs, kk, &alpha_one,
static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_VT_.ptr, dtype, static_cast<const Scalar*>(d_VT_.ptr), vtrows,
static_cast<int>(vtrows), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero, static_cast<const Scalar*>(d_tmp.ptr), kk, &beta_zero,
d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo())); static_cast<Scalar*>(d_X.ptr), n_orig, &gemm_workspace_, stream_);
} else { } else {
// U_stored is m_×ucols. V_orig = U_stored[:,:kk]. NoTrans × tmp. // U_stored is m_×ucols. V_orig = U_stored[:,:kk]. NoTrans × tmp.
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(n_orig), internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_N, CUBLAS_OP_N, n_orig, nrhs, kk, &alpha_one,
static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_U_.ptr, dtype, static_cast<const Scalar*>(d_U_.ptr), m_, static_cast<const Scalar*>(d_tmp.ptr),
static_cast<int>(m_), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero, kk, &beta_zero, static_cast<Scalar*>(d_X.ptr), n_orig, &gemm_workspace_,
d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo())); stream_);
} }
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_X.ptr, EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_X.ptr,

View File

@@ -10,16 +10,25 @@
// GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix // GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix
// multiply (SpMM) via cuSPARSE. // multiply (SpMM) via cuSPARSE.
// //
// GpuSparseContext manages a cuSPARSE handle and device buffers. It accepts // GpuSparseContext manages cuSPARSE descriptors and device buffers. It accepts
// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the // Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the GPU.
// GPU. RowMajor input is implicitly converted to ColMajor. // RowMajor input is implicitly converted to ColMajor.
//
// Can borrow a GpuContext for same-stream execution with BLAS-1 ops (zero
// event overhead in iterative solvers like CG).
// //
// Usage: // Usage:
// // Standalone (own stream):
// GpuSparseContext<double> ctx; // GpuSparseContext<double> ctx;
// VectorXd y = ctx.multiply(A, x); // y = A * x // VectorXd y = ctx.multiply(A, x);
// ctx.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y //
// VectorXd z = ctx.multiplyT(A, x); // z = A^T * x // // Shared context (same stream as BLAS-1 ops):
// MatrixXd Y = ctx.multiplyMat(A, X); // Y = A * X (multiple RHS) // GpuContext gpu_ctx;
// GpuSparseContext<double> sparse_ctx(gpu_ctx);
// VectorXd y = sparse_ctx.multiply(A, x);
//
// // Device-resident (no host roundtrip):
// sparse_ctx.multiply(A, d_x, d_y); // DeviceMatrix in/out
#ifndef EIGEN_GPU_SPARSE_CONTEXT_H #ifndef EIGEN_GPU_SPARSE_CONTEXT_H
#define EIGEN_GPU_SPARSE_CONTEXT_H #define EIGEN_GPU_SPARSE_CONTEXT_H
@@ -31,6 +40,57 @@
namespace Eigen { namespace Eigen {
// Forward declarations.
template <typename Scalar_>
class GpuSparseContext;
template <typename Scalar_>
class DeviceSparseView;
/** SpMV expression: DeviceSparseView * DeviceMatrix → SpMVExpr.
* Evaluated by DeviceMatrix::operator=(SpMVExpr). */
template <typename Scalar_>
class SpMVExpr {
public:
using Scalar = Scalar_;
SpMVExpr(const DeviceSparseView<Scalar>& view, const DeviceMatrix<Scalar>& x) : view_(view), x_(x) {}
const DeviceSparseView<Scalar>& view() const { return view_; }
const DeviceMatrix<Scalar>& x() const { return x_; }
private:
const DeviceSparseView<Scalar>& view_;
const DeviceMatrix<Scalar>& x_;
};
/** Device-resident sparse matrix view. Returned by GpuSparseContext::deviceView().
* Lightweight handle referencing the context's cached device data.
*
* \warning One GpuSparseContext caches one sparse matrix at a time.
* Creating a second deviceView on the same context overwrites the first.
* For multiple simultaneous sparse matrices, use separate GpuSparseContext
* instances (they can share a GpuContext for same-stream execution).
*
* Supports `d_y = d_A * d_x` via SpMVExpr. */
template <typename Scalar_>
class DeviceSparseView {
public:
using Scalar = Scalar_;
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
DeviceSparseView(GpuSparseContext<Scalar>& ctx, const SpMat& A) : ctx_(ctx), A_(A) {}
/** SpMV expression: d_A * d_x. Evaluated by DeviceMatrix::operator=. */
SpMVExpr<Scalar> operator*(const DeviceMatrix<Scalar>& x) const { return SpMVExpr<Scalar>(*this, x); }
Index rows() const { return A_.rows(); }
Index cols() const { return A_.cols(); }
const GpuSparseContext<Scalar>& context() const { return ctx_; }
const SpMat& matrix() const { return A_; }
private:
GpuSparseContext<Scalar>& ctx_;
const SpMat& A_;
};
template <typename Scalar_> template <typename Scalar_>
class GpuSparseContext { class GpuSparseContext {
public: public:
@@ -41,22 +101,47 @@ class GpuSparseContext {
using DenseVector = Matrix<Scalar, Dynamic, 1>; using DenseVector = Matrix<Scalar, Dynamic, 1>;
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>; using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
GpuSparseContext() { /** Standalone: creates own stream and cuSPARSE handle. */
GpuSparseContext() : owns_handle_(true) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_)); EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
owns_stream_ = true;
EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_)); EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_));
EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_)); EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_));
} }
/** Borrow a GpuContext: shares stream and cuSPARSE handle.
* The GpuContext must outlive this GpuSparseContext. */
explicit GpuSparseContext(GpuContext& ctx)
: stream_(ctx.stream()), handle_(ctx.cusparseHandle()), owns_stream_(false), owns_handle_(false) {}
~GpuSparseContext() { ~GpuSparseContext() {
destroy_descriptors(); destroy_descriptors();
if (handle_) (void)cusparseDestroy(handle_); if (owns_handle_ && handle_) (void)cusparseDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_); if (owns_stream_ && stream_) (void)cudaStreamDestroy(stream_);
} }
GpuSparseContext(const GpuSparseContext&) = delete; GpuSparseContext(const GpuSparseContext&) = delete;
GpuSparseContext& operator=(const GpuSparseContext&) = delete; GpuSparseContext& operator=(const GpuSparseContext&) = delete;
// ---- SpMV: y = A * x ----------------------------------------------------- // ---- Device sparse view (for expression syntax: d_y = d_A * d_x) ----------
/** Upload a sparse matrix to device and return a lightweight view.
* The sparse data is uploaded immediately and cached in this context.
* The returned view can be used for repeated SpMV without re-uploading.
* If the matrix values change, call deviceView() again to re-upload.
*
* \warning One context caches one matrix. Calling deviceView() again
* overwrites the previous upload. For multiple simultaneous matrices,
* use separate GpuSparseContext instances sharing the same GpuContext.
*
* Supports `d_y = d_A * d_x` expression syntax. */
DeviceSparseView<Scalar> deviceView(const SpMat& A) {
eigen_assert(A.isCompressed());
upload_sparse(A);
return DeviceSparseView<Scalar>(*this, A);
}
// ---- SpMV: y = A * x (host vectors) --------------------------------------
/** Compute y = A * x. Returns y as a new dense vector. */ /** Compute y = A * x. Returns y as a new dense vector. */
template <typename InputType, typename Rhs> template <typename InputType, typename Rhs>
@@ -64,34 +149,52 @@ class GpuSparseContext {
const SpMat mat(A.derived()); const SpMat mat(A.derived());
DenseVector y(mat.rows()); DenseVector y(mat.rows());
y.setZero(); y.setZero();
multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE); multiply_host_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
return y; return y;
} }
/** Compute y = alpha * op(A) * x + beta * y (in-place). */ /** Compute y = alpha * op(A) * x + beta * y (in-place, host vectors). */
template <typename InputType, typename Rhs, typename Dest> template <typename InputType, typename Rhs, typename Dest>
void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y, void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y,
Scalar alpha = Scalar(1), Scalar beta = Scalar(0), Scalar alpha = Scalar(1), Scalar beta = Scalar(0),
cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) { cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
const SpMat mat(A.derived()); const SpMat mat(A.derived());
multiply_impl(mat, x.derived(), y.derived(), alpha, beta, op); multiply_host_impl(mat, x.derived(), y.derived(), alpha, beta, op);
} }
// ---- SpMV transpose: y = A^T * x ----------------------------------------- // ---- SpMV: y = A * x (DeviceMatrix, no host roundtrip) -------------------
/** Compute y = A^T * x. Returns y as a new dense vector. */ /** Compute d_y = A * d_x. Device-resident, no host transfer.
* Sparse matrix A is uploaded to device (cached). Dense vectors stay on device. */
template <typename InputType>
void multiply(const SparseMatrixBase<InputType>& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y) {
const SpMat mat(A.derived());
multiply_device_impl(mat, d_x, d_y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
}
/** Compute d_y = alpha * op(A) * d_x + beta * d_y (DeviceMatrix, in-place). */
template <typename InputType>
void multiply(const SparseMatrixBase<InputType>& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y,
Scalar alpha, Scalar beta, cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
const SpMat mat(A.derived());
multiply_device_impl(mat, d_x, d_y, alpha, beta, op);
}
// ---- SpMV transpose -------------------------------------------------------
/** Compute y = A^T * x (host vectors). */
template <typename InputType, typename Rhs> template <typename InputType, typename Rhs>
DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) { DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const SpMat mat(A.derived()); const SpMat mat(A.derived());
DenseVector y(mat.cols()); DenseVector y(mat.cols());
y.setZero(); y.setZero();
multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE); multiply_host_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE);
return y; return y;
} }
// ---- SpMM: Y = A * X (multiple RHS) -------------------------------------- // ---- SpMM: Y = A * X (host, multiple RHS) --------------------------------
/** Compute Y = A * X where X is a dense matrix (multiple RHS). Returns Y. */ /** Compute Y = A * X where X is a dense matrix. Returns Y. */
template <typename InputType, typename Rhs> template <typename InputType, typename Rhs>
DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X) { DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X) {
const SpMat mat(A.derived()); const SpMat mat(A.derived());
@@ -114,32 +217,37 @@ class GpuSparseContext {
private: private:
cudaStream_t stream_ = nullptr; cudaStream_t stream_ = nullptr;
cusparseHandle_t handle_ = nullptr; cusparseHandle_t handle_ = nullptr;
bool owns_stream_ = false;
bool owns_handle_ = false;
// Cached device buffers (grow-only). // Cached device buffers for sparse matrix (grow-only).
internal::DeviceBuffer d_outerPtr_; internal::DeviceBuffer d_outerPtr_;
internal::DeviceBuffer d_innerIdx_; internal::DeviceBuffer d_innerIdx_;
internal::DeviceBuffer d_values_; internal::DeviceBuffer d_values_;
internal::DeviceBuffer d_x_;
internal::DeviceBuffer d_y_;
internal::DeviceBuffer d_workspace_;
size_t d_outerPtr_size_ = 0; size_t d_outerPtr_size_ = 0;
size_t d_innerIdx_size_ = 0; size_t d_innerIdx_size_ = 0;
size_t d_values_size_ = 0; size_t d_values_size_ = 0;
// Cached device buffers for host-API dense vectors (grow-only).
internal::DeviceBuffer d_x_;
internal::DeviceBuffer d_y_;
size_t d_x_size_ = 0; size_t d_x_size_ = 0;
size_t d_y_size_ = 0; size_t d_y_size_ = 0;
size_t d_workspace_size_ = 0;
// Cached cuSPARSE descriptors. mutable internal::DeviceBuffer d_workspace_;
mutable size_t d_workspace_size_ = 0;
// Cached cuSPARSE sparse matrix descriptor.
cusparseSpMatDescr_t spmat_desc_ = nullptr; cusparseSpMatDescr_t spmat_desc_ = nullptr;
Index cached_rows_ = -1; Index cached_rows_ = -1;
Index cached_cols_ = -1; Index cached_cols_ = -1;
Index cached_nnz_ = -1; Index cached_nnz_ = -1;
// ---- SpMV implementation -------------------------------------------------- // ---- SpMV with host vectors (upload/download per call) --------------------
template <typename RhsDerived, typename DestDerived> template <typename RhsDerived, typename DestDerived>
void multiply_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta, void multiply_host_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta,
cusparseOperation_t op) { cusparseOperation_t op) {
eigen_assert(A.isCompressed()); eigen_assert(A.isCompressed());
const Index m = A.rows(); const Index m = A.rows();
@@ -159,16 +267,13 @@ class GpuSparseContext {
return; return;
} }
// Upload sparse matrix to device.
upload_sparse(A); upload_sparse(A);
// Upload x to device.
ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar)); ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar));
const DenseVector x_tmp(x); const DenseVector x_tmp(x);
EIGEN_CUDA_RUNTIME_CHECK( EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_.ptr, x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_)); cudaMemcpyAsync(d_x_.ptr, x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
// Upload y to device (for beta != 0).
ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar)); ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar));
if (beta != Scalar(0)) { if (beta != Scalar(0)) {
const DenseVector y_tmp(y); const DenseVector y_tmp(y);
@@ -176,27 +281,80 @@ class GpuSparseContext {
cudaMemcpyAsync(d_y_.ptr, y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_)); cudaMemcpyAsync(d_y_.ptr, y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
} }
// Create dense vector descriptors. exec_spmv(x_size, y_size, d_x_.ptr, d_y_.ptr, alpha, beta, op);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(y.data(), d_y_.ptr, y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
}
// ---- SpMV with DeviceMatrix (no host transfer) ----------------------------
// Called by public multiply(A, d_x, d_y) — always re-uploads A.
void multiply_device_impl(const SpMat& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y, Scalar alpha,
Scalar beta, cusparseOperation_t op) {
upload_sparse(A);
spmv_device_exec(d_x, d_y, alpha, beta, op);
}
public:
/** Execute SpMV using the already-uploaded sparse matrix (no re-upload).
* Used by SpMVExpr (d_y = d_A * d_x) for cached deviceView() paths.
* The sparse matrix must have been uploaded via deviceView() or multiply(). */
void spmv_device_exec(const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y, Scalar alpha = Scalar(1),
Scalar beta = Scalar(0), cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) const {
eigen_assert(spmat_desc_ && "sparse matrix not uploaded — call deviceView() or multiply() first");
// cuSPARSE SpMV: y must not alias x (undefined behavior).
eigen_assert(d_x.data() != d_y.data() && "SpMV: output aliases input vector");
const Index m = cached_rows_;
const Index n = cached_cols_;
const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
eigen_assert(d_x.rows() * d_x.cols() == x_size);
if (m == 0 || n == 0 || cached_nnz_ == 0) {
d_y.resize(y_size, 1);
if (beta == Scalar(0)) {
d_y.setZero();
}
return;
}
// Ensure d_y is allocated.
if (d_y.rows() * d_y.cols() != y_size) {
d_y.resize(y_size, 1);
}
// Wait for input data to be ready on this stream.
d_x.waitReady(stream_);
d_y.waitReady(stream_);
exec_spmv(x_size, y_size, const_cast<void*>(static_cast<const void*>(d_x.data())), static_cast<void*>(d_y.data()),
alpha, beta, op);
d_y.recordReady(stream_);
}
private:
// ---- Shared SpMV execution ------------------------------------------------
void exec_spmv(Index x_size, Index y_size, void* d_x_ptr, void* d_y_ptr, Scalar alpha, Scalar beta,
cusparseOperation_t op) const {
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value; constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr; cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr;
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_.ptr, dtype)); EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_ptr, dtype));
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_.ptr, dtype)); EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_ptr, dtype));
// Query workspace size.
size_t ws_size = 0; size_t ws_size = 0;
EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype, EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
CUSPARSE_SPMV_ALG_DEFAULT, &ws_size)); CUSPARSE_SPMV_ALG_DEFAULT, &ws_size));
ensure_buffer(d_workspace_, d_workspace_size_, ws_size); ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
// Execute SpMV.
EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype, EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.ptr)); CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.ptr));
// Download result.
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(y.data(), d_y_.ptr, y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
(void)cusparseDestroyDnVec(x_desc); (void)cusparseDestroyDnVec(x_desc);
(void)cusparseDestroyDnVec(y_desc); (void)cusparseDestroyDnVec(y_desc);
} }
@@ -222,7 +380,6 @@ class GpuSparseContext {
upload_sparse(A); upload_sparse(A);
// Upload X to device.
const size_t x_bytes = static_cast<size_t>(k) * static_cast<size_t>(n) * sizeof(Scalar); const size_t x_bytes = static_cast<size_t>(k) * static_cast<size_t>(n) * sizeof(Scalar);
const size_t y_bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(Scalar); const size_t y_bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(Scalar);
ensure_buffer(d_x_, d_x_size_, x_bytes); ensure_buffer(d_x_, d_x_size_, x_bytes);
@@ -232,24 +389,19 @@ class GpuSparseContext {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.ptr, Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_)); EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.ptr, Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_));
} }
// Create dense matrix descriptors.
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value; constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr; cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr;
// Eigen is column-major, so ld = rows.
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k, n, k, d_x_.ptr, dtype, CUSPARSE_ORDER_COL)); EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k, n, k, d_x_.ptr, dtype, CUSPARSE_ORDER_COL));
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m, n, m, d_y_.ptr, dtype, CUSPARSE_ORDER_COL)); EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m, n, m, d_y_.ptr, dtype, CUSPARSE_ORDER_COL));
// Query workspace.
size_t ws_size = 0; size_t ws_size = 0;
EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_,
x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size)); x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size));
ensure_buffer(d_workspace_, d_workspace_size_, ws_size); ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
// Execute SpMM.
EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta, EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta,
y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.ptr)); y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.ptr));
// Download result.
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.ptr, y_bytes, cudaMemcpyDeviceToHost, stream_)); EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.ptr, y_bytes, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
@@ -278,21 +430,18 @@ class GpuSparseContext {
cudaMemcpyAsync(d_innerIdx_.ptr, A.innerIndexPtr(), inner_bytes, cudaMemcpyHostToDevice, stream_)); cudaMemcpyAsync(d_innerIdx_.ptr, A.innerIndexPtr(), inner_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, A.valuePtr(), val_bytes, cudaMemcpyHostToDevice, stream_)); EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, A.valuePtr(), val_bytes, cudaMemcpyHostToDevice, stream_));
// Recreate descriptor if shape changed.
if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) { if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) {
destroy_descriptors(); destroy_descriptors();
constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I; constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value; constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value;
// ColMajor → CSC. outerIndexPtr = col offsets, innerIndexPtr = row indices.
EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr, EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr,
idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO, val_type)); idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO, val_type));
cached_rows_ = m; cached_rows_ = m;
cached_cols_ = n; cached_cols_ = n;
cached_nnz_ = nnz; cached_nnz_ = nnz;
} else { } else {
// Same shape — just update pointers.
EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr)); EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr));
} }
} }
@@ -307,7 +456,7 @@ class GpuSparseContext {
cached_nnz_ = -1; cached_nnz_ = -1;
} }
void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) { void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) const {
if (needed > current_size) { if (needed > current_size) {
if (buf.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); if (buf.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
buf = internal::DeviceBuffer(needed); buf = internal::DeviceBuffer(needed);
@@ -316,6 +465,17 @@ class GpuSparseContext {
} }
}; };
// ---- DeviceMatrix::operator=(SpMVExpr) out-of-line definition ----------------
// Defined here because it needs the full GpuSparseContext definition.
template <typename Scalar_>
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const SpMVExpr<Scalar_>& expr) {
// Use spmv_device_exec — the sparse matrix was already uploaded by deviceView().
// No re-upload on repeated SpMV with the same view.
expr.view().context().spmv_device_exec(expr.x(), *this, Scalar_(1), Scalar_(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
return *this;
}
} // namespace Eigen } // namespace Eigen
#endif // EIGEN_GPU_SPARSE_CONTEXT_H #endif // EIGEN_GPU_SPARSE_CONTEXT_H

View File

@@ -21,6 +21,7 @@
#include "./InternalHeaderCheck.h" #include "./InternalHeaderCheck.h"
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <vector>
namespace Eigen { namespace Eigen {
namespace internal { namespace internal {
@@ -36,26 +37,99 @@ namespace internal {
// ---- RAII: device buffer ---------------------------------------------------- // ---- RAII: device buffer ----------------------------------------------------
// Thread-local pool of small device buffers to avoid cudaMalloc/cudaFree
// overhead for tiny allocations (e.g., DeviceScalar). Buffers up to
// kSmallBufferThreshold bytes are recycled; larger allocations bypass the pool.
template <size_t SmallBufferThreshold = 256, size_t MaxPoolSize = 64>
struct DeviceBufferPool {
static constexpr size_t kSmallBufferThreshold = SmallBufferThreshold;
static constexpr size_t kMaxPoolSize = MaxPoolSize;
struct Entry {
void* ptr;
size_t bytes;
};
~DeviceBufferPool() {
for (auto& e : free_list_) (void)cudaFree(e.ptr);
}
void* allocate(size_t bytes) {
// Search for a buffer of sufficient size.
for (size_t i = 0; i < free_list_.size(); ++i) {
if (free_list_[i].bytes >= bytes) {
void* p = free_list_[i].ptr;
free_list_[i] = free_list_.back();
free_list_.pop_back();
return p;
}
}
// No suitable buffer found — allocate new.
void* p = nullptr;
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
return p;
}
void deallocate(void* p, size_t bytes) {
if (free_list_.size() < kMaxPoolSize) {
free_list_.push_back({p, bytes});
} else {
(void)cudaFree(p);
}
}
static DeviceBufferPool& threadLocal() {
thread_local DeviceBufferPool pool;
return pool;
}
private:
std::vector<Entry> free_list_;
};
struct DeviceBuffer { struct DeviceBuffer {
void* ptr = nullptr; void* ptr = nullptr;
DeviceBuffer() = default; DeviceBuffer() = default;
explicit DeviceBuffer(size_t bytes) { explicit DeviceBuffer(size_t bytes) : size_(bytes) {
if (bytes > 0) EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&ptr, bytes)); if (bytes > 0) {
if (bytes <= DeviceBufferPool<>::kSmallBufferThreshold) {
ptr = DeviceBufferPool<>::threadLocal().allocate(bytes);
} else {
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&ptr, bytes));
}
}
} }
~DeviceBuffer() { ~DeviceBuffer() {
if (ptr) (void)cudaFree(ptr); // destructor: ignore errors if (ptr) {
if (size_ <= DeviceBufferPool<>::kSmallBufferThreshold) {
DeviceBufferPool<>::threadLocal().deallocate(ptr, size_);
} else {
(void)cudaFree(ptr);
}
}
} }
// Move-only. // Move-only.
DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr) { o.ptr = nullptr; } DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr), size_(o.size_) {
o.ptr = nullptr;
o.size_ = 0;
}
DeviceBuffer& operator=(DeviceBuffer&& o) noexcept { DeviceBuffer& operator=(DeviceBuffer&& o) noexcept {
if (this != &o) { if (this != &o) {
if (ptr) (void)cudaFree(ptr); if (ptr) {
if (size_ <= DeviceBufferPool<>::kSmallBufferThreshold) {
DeviceBufferPool<>::threadLocal().deallocate(ptr, size_);
} else {
(void)cudaFree(ptr);
}
}
ptr = o.ptr; ptr = o.ptr;
size_ = o.size_;
o.ptr = nullptr; o.ptr = nullptr;
o.size_ = 0;
} }
return *this; return *this;
} }
@@ -63,12 +137,19 @@ struct DeviceBuffer {
DeviceBuffer(const DeviceBuffer&) = delete; DeviceBuffer(const DeviceBuffer&) = delete;
DeviceBuffer& operator=(const DeviceBuffer&) = delete; DeviceBuffer& operator=(const DeviceBuffer&) = delete;
size_t size() const { return size_; }
// Adopt an existing device pointer. Caller relinquishes ownership. // Adopt an existing device pointer. Caller relinquishes ownership.
// Adopted buffers bypass the pool on destruction.
static DeviceBuffer adopt(void* p) { static DeviceBuffer adopt(void* p) {
DeviceBuffer b; DeviceBuffer b;
b.ptr = p; b.ptr = p;
b.size_ = DeviceBufferPool<>::kSmallBufferThreshold + 1; // force cudaFree
return b; return b;
} }
private:
size_t size_ = 0;
}; };
// ---- Scalar → cudaDataType_t ------------------------------------------------ // ---- Scalar → cudaDataType_t ------------------------------------------------

View File

@@ -62,6 +62,23 @@ The GPU version reads like CPU Eigen with explicit upload/download for dense
operations, and an almost identical API for sparse solvers. Unsupported operations, and an almost identical API for sparse solvers. Unsupported
expressions are compile errors. expressions are compile errors.
**Standalone module.** `Eigen/GPU` does not modify or depend on Eigen's Core
expression template system (`MatrixBase`, `CwiseBinaryOp`, etc.).
`DeviceMatrix` is not an Eigen expression type and does not inherit from
`MatrixBase`. The expression layer is a thin compile-time dispatch where every
supported expression maps to a single NVIDIA library call. There is no
coefficient-level evaluation, lazy fusion, or packet operations.
**Interoperability where useful.** `DeviceMatrix` provides the same operator
signatures as `Matrix` for common vector operations: `+=`, `-=`, `*=`,
`dot()`, `squaredNorm()`, `norm()`, `setZero()`, and `noalias()`. This makes
`DeviceMatrix` usable as a drop-in `VectorType` in Eigen algorithm templates
that rely on these operations. For example, Eigen's `conjugate_gradient()`
template works with `DeviceMatrix` with a single typedef change -- no
modifications to the algorithm or the expression template system. Conjugate
gradient is just the motivating example; we are open to expanding operator
coverage as needed to support other high-level Eigen algorithms on the GPU.
**Explicit over implicit.** Host-device transfers, stream management, and **Explicit over implicit.** Host-device transfers, stream management, and
library handle lifetimes are visible in the API. There are no hidden library handle lifetimes are visible in the API. There are no hidden
allocations or synchronizations except where documented (e.g., `toHost()` must allocations or synchronizations except where documented (e.g., `toHost()` must
@@ -96,6 +113,27 @@ MatrixXd C = transfer.get();
`selfadjointView<UpLo>()`, `llt()`, `lu()`. These return lightweight `selfadjointView<UpLo>()`, `llt()`, `lu()`. These return lightweight
expression objects that are evaluated when assigned. expression objects that are evaluated when assigned.
For BLAS Level-1 operations, `DeviceMatrix` also provides `dot()`, `norm()`,
`squaredNorm()`, `setZero()`, `noalias()`, and arithmetic operators
(`+=`, `-=`, `*=`) that dispatch to cuBLAS `axpy`, `nrm2`, `dot`, and
`geam`. These are the operations needed by iterative solvers.
### `DeviceScalar<Scalar>`
A device-resident scalar value. Reductions like `dot()`, `norm()`, and
`squaredNorm()` return `DeviceScalar` instead of a host scalar, deferring
the host synchronization until the value is actually needed:
```cpp
auto dot_val = d_x.dot(d_y); // DeviceScalar -- no sync
auto norm_sq = d_r.squaredNorm(); // DeviceScalar -- no sync
Scalar alpha = dot_val / norm_sq; // sync here (implicit conversion)
d_x += alpha * d_p; // host scalar * DeviceMatrix (axpy)
```
Division between `DeviceScalar` values (real types only) is performed on
device via NPP, avoiding extra synchronizations.
### `GpuContext` ### `GpuContext`
Every GPU operation needs a CUDA stream and library handles (cuBLAS, Every GPU operation needs a CUDA stream and library handles (cuBLAS,
@@ -118,6 +156,12 @@ d_C1.device(ctx1) = d_A1 * d_B1; // runs on stream 1
d_C2.device(ctx2) = d_A2 * d_B2; // runs on stream 2 (concurrently) d_C2.device(ctx2) = d_A2 * d_B2; // runs on stream 2 (concurrently)
``` ```
To integrate with existing CUDA code, borrow an existing stream:
```cpp
GpuContext ctx(my_existing_stream); // wraps stream, does not take ownership
```
## Usage ## Usage
### Matrix operations (cuBLAS) ### Matrix operations (cuBLAS)
@@ -133,7 +177,7 @@ d_C = d_A * d_B.transpose();
// Scaled and accumulated // Scaled and accumulated
d_C += 2.0 * d_A * d_B; // alpha=2, beta=1 d_C += 2.0 * d_A * d_B; // alpha=2, beta=1
d_C.device(ctx) -= d_A * d_B; // alpha=-1, beta=1 (requires explicit context) d_C.device(ctx) -= d_A * d_B; // alpha=-1, beta=1 (GEMM requires explicit context for -=)
// Triangular solve (TRSM) // Triangular solve (TRSM)
d_X = d_A.triangularView<Lower>().solve(d_B); d_X = d_A.triangularView<Lower>().solve(d_B);
@@ -145,6 +189,30 @@ d_C = d_A.selfadjointView<Lower>() * d_B;
d_C.selfadjointView<Lower>().rankUpdate(d_A); // C += A * A^H d_C.selfadjointView<Lower>().rankUpdate(d_A); // C += A * A^H
``` ```
### BLAS Level-1 operations
```cpp
// Dot product and norms (return DeviceScalar -- no sync until read)
auto dot_val = d_x.dot(d_y); // cublasDdot / cublasCdotc
auto norm_val = d_r.norm(); // cublasDnrm2
double n = norm_val; // implicit conversion triggers sync
// Vector arithmetic (cuBLAS axpy / geam)
d_x += alpha * d_p; // axpy: x = x + alpha * p
d_x -= alpha * d_p; // axpy: x = x - alpha * p
d_x *= alpha; // scal: x = alpha * x
d_r.setZero(); // cudaMemsetAsync
// DeviceScalar arithmetic (stays on device, real types only)
auto alpha = absNew / dot_val; // device-side division via NPP
d_x += alpha * d_p; // DeviceScalar * DeviceMatrix (axpy with device pointer)
// Matrix add/subtract (cuBLAS geam)
DeviceMatrix<double> d_C = d_A + d_B; // C = A + B
d_C = d_A + 2.0 * d_B; // C = A + 2*B
d_C = d_A - d_B; // C = A - B
```
### Dense solvers (cuSOLVER) ### Dense solvers (cuSOLVER)
**One-shot expression syntax** -- Convenient, re-factorizes each time: **One-shot expression syntax** -- Convenient, re-factorizes each time:
@@ -183,6 +251,7 @@ GpuSVD<double> svd;
svd.compute(d_A, ComputeThinU | ComputeThinV); svd.compute(d_A, ComputeThinU | ComputeThinV);
VectorXd S = svd.singularValues(); // downloads to host VectorXd S = svd.singularValues(); // downloads to host
MatrixXd U = svd.matrixU(); // downloads to host MatrixXd U = svd.matrixU(); // downloads to host
MatrixXd V = svd.matrixV(); // V (matches JacobiSVD)
MatrixXd VT = svd.matrixVT(); // V^T (matches cuSOLVER) MatrixXd VT = svd.matrixVT(); // V^T (matches cuSOLVER)
// Self-adjoint eigenvalue decomposition (results downloaded on access) // Self-adjoint eigenvalue decomposition (results downloaded on access)
@@ -253,21 +322,60 @@ MatrixXcf C = fft.inv2d(B); // 2D inverse (scaled by 1/(rows*cols))
SparseMatrix<double> A = ...; SparseMatrix<double> A = ...;
VectorXd x = ...; VectorXd x = ...;
GpuSparseContext<double> ctx; // Host vectors (upload/download handled internally)
VectorXd y = ctx.multiply(A, x); // y = A * x GpuSparseContext<double> spmv;
VectorXd z = ctx.multiplyT(A, x); // z = A^T * x VectorXd y = spmv.multiply(A, x); // y = A * x
ctx.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y VectorXd z = spmv.multiplyT(A, x); // z = A^T * x
spmv.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y
MatrixXd Y = spmv.multiplyMat(A, X); // Y = A * X (SpMM)
// Multiple RHS (SpMM) // Device-resident SpMV (sparse matrix cached on device)
MatrixXd Y = ctx.multiplyMat(A, X); // Y = A * X GpuSparseContext<double> spmv(ctx); // share GpuContext for same-stream
auto d_A = spmv.deviceView(A); // upload sparse matrix once
d_y = d_A * d_x; // operator syntax, stays on device
```
### Eigen algorithm interop (example: Conjugate gradient)
The BLAS-1 operators and `DeviceSparseView` make `DeviceMatrix` usable as a
vector type in GPU implementations of algorithms like conjugate gradient.
Conjugate gradient is the motivating example -- a GPU CG implementation
uses the same operations as the CPU version:
```cpp
GpuContext ctx;
GpuSparseContext<double> spmv(ctx);
auto d_A = spmv.deviceView(A); // sparse matrix on device
auto d_b = DeviceMatrix<double>::fromHost(b);
auto d_x = DeviceMatrix<double>::fromHost(x0);
// CG iteration using DeviceMatrix operators
DeviceMatrix<double> d_r = d_b; // r = b (deep copy via geam)
DeviceMatrix<double> d_p(n), d_tmp(n);
d_tmp = d_A * d_x; // SpMV (device-resident)
d_r -= d_tmp; // axpy
d_p = d_r.clone();
RealScalar absNew = d_r.squaredNorm(); // DeviceScalar -> implicit sync
for (int i = 0; i < maxIters && absNew > tol * tol; ++i) {
d_tmp = d_A * d_p; // SpMV
auto alpha = absNew / d_p.dot(d_tmp); // host / DeviceScalar -> DeviceScalar
d_x += alpha * d_p; // axpy with DeviceScalar
d_r -= alpha * d_tmp; // axpy with DeviceScalar
RealScalar absOld = absNew;
absNew = d_r.squaredNorm(); // DeviceScalar -> implicit sync
d_p *= Scalar(absNew / absOld); // scal (host scalars)
d_p += d_r; // axpy
}
MatrixXd x = d_x.toHost();
``` ```
### Precision control ### Precision control
GEMM dispatch enables tensor core algorithms by default, allowing cuBLAS to GEMM dispatch uses `cublasLtMatmul` with heuristic algorithm selection,
choose the fastest algorithm for the given precision and architecture. For enabling cuBLAS to choose tensor core algorithms when beneficial. For double
double precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64 precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64 results
results computed faster via tensor cores. computed faster via tensor cores.
| Macro | Effect | | Macro | Effect |
|---|---| |---|---|
@@ -290,6 +398,7 @@ Mandatory sync points:
- `fromHost()` -- Synchronizes to complete the upload before returning - `fromHost()` -- Synchronizes to complete the upload before returning
- `toHost()` / `HostTransfer::get()` -- Must deliver data to host - `toHost()` / `HostTransfer::get()` -- Must deliver data to host
- `info()` -- Must read the factorization status - `info()` -- Must read the factorization status
- `DeviceScalar` implicit conversion -- Downloads scalar from device
**Cross-stream safety** is automatic. `DeviceMatrix` tracks write completion **Cross-stream safety** is automatic. `DeviceMatrix` tracks write completion
via CUDA events. When a matrix written on stream A is read on stream B, the via CUDA events. When a matrix written on stream A is read on stream B, the
@@ -307,21 +416,33 @@ noted otherwise).
| DeviceMatrix expression | Library call | Parameters | | DeviceMatrix expression | Library call | Parameters |
|---|---|---| |---|---|---|
| `C = A * B` | `cublasGemmEx` | transA=N, transB=N, alpha=1, beta=0 | | `C = A * B` | `cublasLtMatmul` | transA=N, transB=N, alpha=1, beta=0 |
| `C = A.adjoint() * B` | `cublasGemmEx` | transA=C, transB=N | | `C = A.adjoint() * B` | `cublasLtMatmul` | transA=C, transB=N |
| `C = A.transpose() * B` | `cublasGemmEx` | transA=T, transB=N | | `C = A.transpose() * B` | `cublasLtMatmul` | transA=T, transB=N |
| `C = A * B.adjoint()` | `cublasGemmEx` | transA=N, transB=C | | `C = A * B.adjoint()` | `cublasLtMatmul` | transA=N, transB=C |
| `C = A * B.transpose()` | `cublasGemmEx` | transA=N, transB=T | | `C = A * B.transpose()` | `cublasLtMatmul` | transA=N, transB=T |
| `C = alpha * A * B` | `cublasGemmEx` | alpha from LHS | | `C = alpha * A * B` | `cublasLtMatmul` | alpha from LHS |
| `C = A * (alpha * B)` | `cublasGemmEx` | alpha from RHS | | `C = A * (alpha * B)` | `cublasLtMatmul` | alpha from RHS |
| `C += A * B` | `cublasGemmEx` | alpha=1, beta=1 | | `C += A * B` | `cublasLtMatmul` | alpha=1, beta=1 |
| `C.device(ctx) -= A * B` | `cublasGemmEx` | alpha=-1, beta=1 | | `C.device(ctx) -= A * B` | `cublasLtMatmul` | alpha=-1, beta=1 |
| `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs | | `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs |
| `X = A.llt<Upper>().solve(B)` | same | uplo=Upper | | `X = A.llt<Upper>().solve(B)` | same | uplo=Upper |
| `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs | | `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs |
| `X = A.triangularView<L>().solve(B)` | `cublasXtrsm` | side=L, uplo, diag=NonUnit | | `X = A.triangularView<L>().solve(B)` | `cublasXtrsm` | side=L, uplo, diag=NonUnit |
| `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo | | `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
| `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N | | `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
| `C = A + B` | `cublasXgeam` | alpha=1, beta=1 |
| `C = A + alpha * B` | `cublasXgeam` | alpha=1, beta from scaled |
| `C = A - B` | `cublasXgeam` | alpha=1, beta=-1 |
| `C = A - alpha * B` | `cublasXgeam` | alpha=1, beta=-scaled |
| `x += alpha * y` | `cublasXaxpy` | alpha (host scalar) |
| `x += dAlpha * y` | `cublasXaxpy` | alpha (DeviceScalar, device pointer mode) |
| `x -= alpha * y` | `cublasXaxpy` | alpha negated |
| `x *= alpha` | `cublasXscal` | alpha (host or DeviceScalar) |
| `x.dot(y)` | `cublasXdot` / `cublasXdotc` | returns `DeviceScalar` |
| `x.norm()` | `cublasXnrm2` | returns `DeviceScalar<RealScalar>` |
| `x.squaredNorm()` | `cublasXdot(x, x)` | returns `DeviceScalar<RealScalar>` |
| `d_y = view * d_x` | `cusparseSpMV` | device-resident SpMV |
### `DeviceMatrix<Scalar>` ### `DeviceMatrix<Scalar>`
@@ -332,11 +453,12 @@ one column.
```cpp ```cpp
// Construction // Construction
DeviceMatrix<Scalar>() // Empty (0x0) DeviceMatrix<Scalar>() // Empty (0x0)
DeviceMatrix<Scalar>(Index n) // Allocate column vector (n x 1)
DeviceMatrix<Scalar>(rows, cols) // Allocate uninitialized DeviceMatrix<Scalar>(rows, cols) // Allocate uninitialized
// Upload / download // Upload / download
static DeviceMatrix fromHost(matrix, stream=nullptr) // -> DeviceMatrix (syncs) static DeviceMatrix fromHost(matrix, stream=nullptr) // -> DeviceMatrix (syncs)
static DeviceMatrix fromHostAsync(ptr, rows, cols, outerStride, s) // -> DeviceMatrix (no sync, caller manages ptr lifetime) static DeviceMatrix fromHostAsync(ptr, rows, cols, stream) // -> DeviceMatrix (no sync, caller manages ptr lifetime)
PlainMatrix toHost(stream=nullptr) // -> host Matrix (syncs) PlainMatrix toHost(stream=nullptr) // -> host Matrix (syncs)
HostTransfer toHostAsync(stream=nullptr) // -> HostTransfer future (no sync) HostTransfer toHostAsync(stream=nullptr) // -> HostTransfer future (no sync)
DeviceMatrix clone(stream=nullptr) // -> DeviceMatrix (D2D copy, async) DeviceMatrix clone(stream=nullptr) // -> DeviceMatrix (D2D copy, async)
@@ -357,6 +479,50 @@ LuExpr lu() // -> .solve(d_B) -> De
TriangularView triangularView<UpLo>() // -> .solve(d_B) -> DeviceMatrix (TRSM) TriangularView triangularView<UpLo>() // -> .solve(d_B) -> DeviceMatrix (TRSM)
SelfAdjointView selfadjointView<UpLo>() // -> * d_B (SYMM), .rankUpdate(d_A) (SYRK) SelfAdjointView selfadjointView<UpLo>() // -> * d_B (SYMM), .rankUpdate(d_A) (SYRK)
DeviceAssignment device(GpuContext& ctx) // Bind assignment to explicit stream DeviceAssignment device(GpuContext& ctx) // Bind assignment to explicit stream
DeviceMatrix& noalias() // No-op (all ops are implicitly noalias)
// BLAS Level-1 (all have overloads with explicit GpuContext& parameter)
DeviceScalar<Scalar> dot(const DeviceMatrix& other) // cuBLAS dot/dotc -> DeviceScalar
DeviceScalar<RealScalar> norm() // cuBLAS nrm2 -> DeviceScalar
DeviceScalar<RealScalar> squaredNorm() // dot(self, self) -> DeviceScalar (no sync)
void setZero() // cudaMemsetAsync
void addScaled(GpuContext&, Scalar alpha, const DeviceMatrix& x) // this += alpha * x (axpy)
void scale(GpuContext&, Scalar alpha) // this *= alpha (scal)
void copyFrom(GpuContext&, const DeviceMatrix& other) // this = other (D2D copy)
DeviceMatrix& operator+=(Scalar * DeviceMatrix) // cuBLAS axpy
DeviceMatrix& operator-=(Scalar * DeviceMatrix) // cuBLAS axpy (negated)
DeviceMatrix& operator+=(const DeviceMatrix&) // cuBLAS axpy
DeviceMatrix& operator-=(const DeviceMatrix&) // cuBLAS axpy
DeviceMatrix& operator+=(const DeviceScaledDevice&) // cuBLAS axpy (DeviceScalar * DeviceMatrix)
DeviceMatrix& operator-=(const DeviceScaledDevice&) // cuBLAS axpy (DeviceScalar * DeviceMatrix, negated)
DeviceMatrix& operator*=(Scalar) // cuBLAS scal
DeviceMatrix& operator*=(const DeviceScalar<Scalar>&) // cuBLAS scal (device pointer)
DeviceMatrix cwiseProduct(GpuContext&, const DeviceMatrix&) // NPP nppsMul (float/double only)
void cwiseProduct(GpuContext&, const DeviceMatrix&, const DeviceMatrix&) // in-place: this = a .* b
// geam expressions (evaluated on assignment)
DeviceMatrix& operator=(const DeviceAddExpr&) // C = A + B, C = A + alpha*B, C = A - B, etc.
```
### `DeviceScalar<Scalar>`
Device-resident scalar. Returned by `dot()`, `norm()`, and `squaredNorm()`.
Implicit conversion to `Scalar` triggers `cudaStreamSynchronize` + download.
```cpp
DeviceScalar(cudaStream_t stream = nullptr) // Allocate uninitialized
DeviceScalar(Scalar host_val, cudaStream_t stream) // Upload host value
Scalar get() // Download (syncs stream)
operator Scalar() // Implicit conversion (syncs)
Scalar* devicePtr() // Raw device pointer
cudaStream_t stream()
// Device-side arithmetic (no host sync, real types only)
DeviceScalar operator/(DeviceScalar, DeviceScalar) // NPP nppsDiv
DeviceScalar operator/(Scalar, DeviceScalar) // upload + div
DeviceScalar operator/(DeviceScalar, Scalar) // upload + div
DeviceScalar operator-() // NPP nppsMulC(-1)
``` ```
### `GpuContext` ### `GpuContext`
@@ -365,11 +531,15 @@ Unified GPU execution context owning a CUDA stream and library handles.
```cpp ```cpp
GpuContext() // Creates dedicated stream + handles GpuContext() // Creates dedicated stream + handles
GpuContext(cudaStream_t stream) // Borrow existing stream (not owned)
static GpuContext& threadLocal() // Per-thread default (lazy-created) static GpuContext& threadLocal() // Per-thread default (lazy-created)
static void setThreadLocal(GpuContext* ctx) // Override thread-local default (nullptr restores)
cudaStream_t stream() cudaStream_t stream()
cublasHandle_t cublasHandle() cublasHandle_t cublasHandle()
cusolverDnHandle_t cusolverHandle() cusolverDnHandle_t cusolverHandle()
cublasLtHandle_t cublasLtHandle() // Lazy-initialized
cusparseHandle_t cusparseHandle() // Lazy-initialized
``` ```
Non-copyable, non-movable (owns library handles). Non-copyable, non-movable (owns library handles).
@@ -440,6 +610,7 @@ GpuSVD& compute(const DeviceMatrix& d_A, unsigned options = ComputeTh
RealVector singularValues() // -> host vector (syncs, downloads) RealVector singularValues() // -> host vector (syncs, downloads)
PlainMatrix matrixU() // -> host Matrix (syncs, downloads) PlainMatrix matrixU() // -> host Matrix (syncs, downloads)
PlainMatrix matrixV() // -> host Matrix (V = VT^H, matches JacobiSVD)
PlainMatrix matrixVT() // -> host Matrix (syncs, downloads V^T) PlainMatrix matrixVT() // -> host Matrix (syncs, downloads V^T)
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (pseudoinverse) PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (pseudoinverse)
@@ -452,9 +623,9 @@ Index rows() / cols()
cudaStream_t stream() cudaStream_t stream()
``` ```
**Note:** `singularValues()`, `matrixU()`, and `matrixVT()` download to host **Note:** `singularValues()`, `matrixU()`, `matrixV()`, and `matrixVT()`
on each call. Device-side accessors returning `DeviceMatrix` are planned but download to host on each call. Device-side accessors returning `DeviceMatrix`
not yet implemented. are planned but not yet implemented.
### `GpuSelfAdjointEigenSolver<Scalar>` -- Eigendecomposition (cuSOLVER) ### `GpuSelfAdjointEigenSolver<Scalar>` -- Eigendecomposition (cuSOLVER)
@@ -544,28 +715,47 @@ the input scalar type (complex vs real).
### `GpuSparseContext<Scalar>` -- SpMV/SpMM (cuSPARSE) ### `GpuSparseContext<Scalar>` -- SpMV/SpMM (cuSPARSE)
Accepts `SparseMatrix<Scalar, ColMajor>`. All methods accept host data and Accepts `SparseMatrix<Scalar, ColMajor>`.
return host data.
```cpp ```cpp
GpuSparseContext() // Creates own stream + cuSPARSE handle GpuSparseContext() // Creates own stream + cuSPARSE handle
GpuSparseContext(GpuContext& ctx) // Borrow GpuContext for same-stream execution
DenseVector multiply(A, x) // y = A * x // Host data in/out
void multiply(A, x, y, alpha=1, beta=0, // y = alpha*op(A)*x + beta*y DenseVector multiply(A, x) // y = A * x
void multiply(A, x, y, alpha=1, beta=0, // y = alpha*op(A)*x + beta*y
op=CUSPARSE_OPERATION_NON_TRANSPOSE) op=CUSPARSE_OPERATION_NON_TRANSPOSE)
DenseVector multiplyT(A, x) // y = A^T * x DenseVector multiplyT(A, x) // y = A^T * x
DenseMatrix multiplyMat(A, X) // Y = A * X (SpMM) DenseMatrix multiplyMat(A, X) // Y = A * X (SpMM)
// DeviceMatrix in/out (sparse matrix re-uploaded each call)
void multiply(A, d_x, d_y) // SpMV with device vectors
void multiply(A, d_x, d_y, alpha, beta, op)
// Device-resident sparse matrix (upload once, reuse)
DeviceSparseView deviceView(A) // Upload sparse matrix, return view
cudaStream_t stream() cudaStream_t stream()
``` ```
### `DeviceSparseView<Scalar>` -- Device-resident sparse matrix
Returned by `GpuSparseContext::deviceView()`. Holds a sparse matrix on device
for repeated SpMV without re-uploading.
```cpp
SpMVExpr operator*(const DeviceMatrix& d_x) // d_y = view * d_x (evaluated on assignment)
```
### Aliasing ### Aliasing
Unlike Eigen's `Matrix`, where omitting `.noalias()` triggers a copy to a Unlike Eigen's `Matrix`, where omitting `.noalias()` triggers a copy to a
temporary, DeviceMatrix dispatches directly to NVIDIA library calls which have temporary, DeviceMatrix dispatches directly to NVIDIA library calls which have
no built-in aliasing protection. All operations are implicitly noalias. no built-in aliasing protection. All operations are implicitly noalias.
The caller must ensure operands don't alias the destination for GEMM and TRSM The caller must ensure operands don't alias the destination for GEMM and TRSM
(debug asserts catch violations). (debug asserts catch violations). `geam` expressions (`d_C = d_A + alpha * d_B`)
are safe with aliasing. The `.noalias()` method exists as a no-op for Eigen
template compatibility.
## File layout ## File layout
@@ -573,12 +763,14 @@ The caller must ensure operands don't alias the destination for GEMM and TRSM
|------|-----------|----------| |------|-----------|----------|
| `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `cuda_data_type<>` | | `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `cuda_data_type<>` |
| `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` | | `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` |
| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM expression wrappers | | `DeviceExpr.h` | `DeviceMatrix.h` | GEMM and geam expression wrappers |
| `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers | | `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers |
| `DeviceSolverExpr.h` | `DeviceMatrix.h` | Solver expression wrappers (LLT, LU) | | `DeviceSolverExpr.h` | `DeviceMatrix.h` | Solver expression wrappers (LLT, LU) |
| `DeviceScalar.h` | `GpuSupport.h`, `DeviceScalarOps.h` | `DeviceScalar<>` (device-resident scalar) |
| `DeviceScalarOps.h` | `<npps_*.h>` | Scalar div/neg/cwiseProduct via NPP |
| `DeviceDispatch.h` | all above | All dispatch functions + `DeviceAssignment` | | `DeviceDispatch.h` | all above | All dispatch functions + `DeviceAssignment` |
| `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `GpuContext` | | `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `GpuContext` |
| `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>` | cuBLAS error macro, op/compute type maps | | `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>`, `<cublasLt.h>` | cuBLAS/cuBLASLt error macro, type maps |
| `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping | | `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization | | `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization | | `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
@@ -588,7 +780,7 @@ The caller must ensure operands don't alias the destination for GEMM and TRSM
| `CuFftSupport.h` | `GpuSupport.h`, `<cufft.h>` | cuFFT error macro, type-dispatch wrappers | | `CuFftSupport.h` | `GpuSupport.h`, `<cufft.h>` | cuFFT error macro, type-dispatch wrappers |
| `GpuFFT.h` | `CuFftSupport.h`, `CuBlasSupport.h` | 1D/2D FFT with plan caching | | `GpuFFT.h` | `CuFftSupport.h`, `CuBlasSupport.h` | 1D/2D FFT with plan caching |
| `CuSparseSupport.h` | `GpuSupport.h`, `<cusparse.h>` | cuSPARSE error macro | | `CuSparseSupport.h` | `GpuSupport.h`, `<cusparse.h>` | cuSPARSE error macro |
| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE | | `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE, `DeviceSparseView` |
| `CuDssSupport.h` | `GpuSupport.h`, `<cudss.h>` | cuDSS error macro, type traits (optional) | | `CuDssSupport.h` | `GpuSupport.h`, `<cudss.h>` | cuDSS error macro, type traits (optional) |
| `GpuSparseSolverBase.h` | `CuDssSupport.h` | CRTP base for sparse solvers (optional) | | `GpuSparseSolverBase.h` | `CuDssSupport.h` | CRTP base for sparse solvers (optional) |
| `GpuSparseLLT.h` | `GpuSparseSolverBase.h` | Sparse Cholesky via cuDSS (optional) | | `GpuSparseLLT.h` | `GpuSparseSolverBase.h` | Sparse Cholesky via cuDSS (optional) |
@@ -606,7 +798,7 @@ cmake -G Ninja -B build -S . \
cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu \ cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu \
gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen \ gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen \
gpu_device_matrix gpu_cufft gpu_cusparse_spmv gpu_device_matrix gpu_cufft gpu_cusparse_spmv gpu_cg
ctest --test-dir build -R "gpu_" --output-on-failure ctest --test-dir build -R "gpu_" --output-on-failure
# Sparse solvers (cuDSS -- separate install required) # Sparse solvers (cuDSS -- separate install required)
@@ -627,10 +819,19 @@ ctest --test-dir build -R gpu_cudss --output-on-failure
Device-side accessors returning `DeviceMatrix` views of the internal buffers Device-side accessors returning `DeviceMatrix` views of the internal buffers
would allow chaining GPU operations (e.g., `svd.deviceU() * d_A`) without would allow chaining GPU operations (e.g., `svd.deviceU() * d_A`) without
round-tripping through host memory. round-tripping through host memory.
- **Device-resident sparse matrix-vector products.** `GpuSparseContext` - **Batched API (`DeviceBatchMatrix`).** A strided batch of N identical-size
currently operates on host vectors and matrices, uploading and downloading matrices dispatching to cuBLAS/cuSOLVER batched APIs (`cublasDgemmBatched`,
on each call. The key missing piece is a `DeviceSparseView` that holds a `cusolverDnXpotrfBatched`, etc.). This enables robotics and model-predictive
sparse matrix on device and supports operator syntax (`d_y = d_A * d_x`) control workloads where many small independent systems are solved in
with `DeviceMatrix` operands -- keeping the entire SpMV/SpMM pipeline on parallel.
device. This is essential for iterative solvers and any workflow that chains - **cuTENSOR for Tensor module.** Replace the hand-written GPU tensor
sparse and dense operations without returning to the host. contraction and reduction kernels (~2300 lines in
`TensorContractionGpu.h` / `TensorReductionGpu.h`) with cuTENSOR dispatch,
following the same library-dispatch pattern used by `Eigen/GPU`.
- **Unified/zero-copy memory for Jetson.** Use `cudaMallocManaged` or
`cudaHostAllocMapped` to eliminate `fromHost()` / `toHost()` copies on
integrated GPUs (Jetson) where CPU and GPU share DRAM.
- **Device-side Eigen interop.** Bridge between host-side `DeviceMatrix`
dispatch and device-side Eigen expression templates (Core + Tensor) running
inside CUDA kernels. Raw-pointer + `Map` / `TensorMap` as the zero-copy
interop surface.

View File

@@ -31,7 +31,10 @@ EIGEN_DONT_INLINE void conjugate_gradient(const MatrixType& mat, const Rhs& rhs,
Index& iters, typename Dest::RealScalar& tol_error) { Index& iters, typename Dest::RealScalar& tol_error) {
typedef typename Dest::RealScalar RealScalar; typedef typename Dest::RealScalar RealScalar;
typedef typename Dest::Scalar Scalar; typedef typename Dest::Scalar Scalar;
typedef Matrix<Scalar, Dynamic, 1> VectorType; // Use Dest's plain (owning) type as VectorType. For CPU Matrix/Map this
// resolves to Matrix<Scalar,Dynamic,1>. For GPU DeviceMatrix, PlainObject
// is DeviceMatrix itself (already owning).
typedef typename Dest::PlainObject VectorType;
RealScalar tol = tol_error; RealScalar tol = tol_error;
Index maxIters = iters; Index maxIters = iters;

View File

@@ -11,7 +11,7 @@
# ncu --set full -o profile ./build-bench-gpu/bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096 # ncu --set full -o profile ./build-bench-gpu/bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096
cmake_minimum_required(VERSION 3.18) cmake_minimum_required(VERSION 3.18)
project(EigenGpuBenchmarks CXX) project(EigenGpuBenchmarks CXX CUDA)
find_package(benchmark REQUIRED) find_package(benchmark REQUIRED)
find_package(CUDAToolkit REQUIRED) find_package(CUDAToolkit REQUIRED)
@@ -55,3 +55,37 @@ eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITI
# FFT benchmarks: 1D/2D C2C, R2C, C2R throughput and plan reuse. # FFT benchmarks: 1D/2D C2C, R2C, C2R throughput and plan reuse.
eigen_add_gpu_benchmark(bench_gpu_fft bench_gpu_fft.cpp LIBRARIES CUDA::cufft) eigen_add_gpu_benchmark(bench_gpu_fft bench_gpu_fft.cpp LIBRARIES CUDA::cufft)
eigen_add_gpu_benchmark(bench_gpu_fft_double bench_gpu_fft.cpp LIBRARIES CUDA::cufft DEFINITIONS SCALAR=double) eigen_add_gpu_benchmark(bench_gpu_fft_double bench_gpu_fft.cpp LIBRARIES CUDA::cufft DEFINITIONS SCALAR=double)
# CG sync overhead benchmark: host vs device pointer mode for reductions.
# Uses CUDA kernels for device scalar arithmetic.
add_executable(bench_gpu_cg_sync bench_gpu_cg_sync.cu)
target_include_directories(bench_gpu_cg_sync PRIVATE
${EIGEN_SOURCE_DIR}
${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(bench_gpu_cg_sync PRIVATE
benchmark::benchmark benchmark::benchmark_main
CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
target_compile_options(bench_gpu_cg_sync PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
target_compile_definitions(bench_gpu_cg_sync PRIVATE EIGEN_USE_GPU)
# GPU CG vs CPU CG comparison benchmark.
add_executable(bench_gpu_cg_vs_cpu bench_gpu_cg_vs_cpu.cu)
target_include_directories(bench_gpu_cg_vs_cpu PRIVATE
${EIGEN_SOURCE_DIR}
${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(bench_gpu_cg_vs_cpu PRIVATE
benchmark::benchmark benchmark::benchmark_main
CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
target_compile_options(bench_gpu_cg_vs_cpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
target_compile_definitions(bench_gpu_cg_vs_cpu PRIVATE EIGEN_USE_GPU)
# Bundle Adjustment benchmark: GPU CG vs CPU CG on real BAL datasets.
add_executable(bench_gpu_ba bench_gpu_ba.cu)
target_include_directories(bench_gpu_ba PRIVATE
${EIGEN_SOURCE_DIR}
${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(bench_gpu_ba PRIVATE
benchmark::benchmark
CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
target_compile_options(bench_gpu_ba PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
target_compile_definitions(bench_gpu_ba PRIVATE EIGEN_USE_GPU)

View File

@@ -0,0 +1,149 @@
# Bundle Adjustment: GPU CG vs CPU CG Results
Benchmark of Eigen's GPU CG pipeline on normal equations arising from bundle
adjustment (BAL datasets). Compares CPU `ConjugateGradient` (Jacobi preconditioner)
against GPU CG using `DeviceMatrix` + `GpuSparseContext` + `DeviceScalar`.
## Hardware
- **CPU**: Intel Core i7-13700HX (Raptor Lake, 12 cores / 24 threads, single thread for Eigen CG)
- **GPU**: NVIDIA GeForce RTX 4070 Laptop GPU (Ada Lovelace, 4608 CUDA cores, 8 GB GDDR6)
- **CUDA**: 13.2 / Driver 595.79
- **OS**: Ubuntu 24.04 (WSL2, kernel 6.6.87)
## Software
- Eigen: `eigen-gpu-cg` branch
- Google Benchmark 1.9.1
- Compiler: nvcc 13.2 + g++ 13.3
- Normal equations: H = J^T*J + I (Levenberg-Marquardt damping lambda=1.0)
- CG tolerance: 1e-8, max iterations: 10000
## Method
For each BAL problem file:
1. Parse the BAL file (cameras, 3D points, 2D observations)
2. Compute the full Jacobian J using the BAL camera model (Rodrigues rotation +
perspective projection + radial distortion) with central finite differences
3. Form the normal equations H = J^T*J + lambda*I (sparse, symmetric positive definite)
4. Solve H*dx = -J^T*r using CG with Jacobi preconditioner on CPU and GPU
5. Report wall-clock time (mean of 3 repetitions)
GPU CG uses: `GpuSparseContext` for SpMV, `DeviceMatrix` for vectors,
`DeviceScalar` with `CUBLAS_POINTER_MODE_DEVICE` for dot/norm reductions,
in-place `cwiseProduct` via NPP for Jacobi preconditioner application,
device-pointer-mode `scal` to avoid host sync on the beta update.
## Results
### Summary table
| Dataset | Cameras | Points | Obs | H size | H nnz | CG iters | CPU CG (ms) | GPU CG (ms) | Speedup |
|---------|---------|--------|-----|--------|-------|----------|-------------|-------------|---------|
| Ladybug-49 | 49 | 7,776 | 31,843 | 23,769 | 1.8M | 4,421 | 4,006 | 1,152 | **3.5x** |
| Ladybug-138 | 138 | 19,878 | 85,217 | 60,876 | 4.8M | 7,008 | 21,498 | 3,553 | **6.1x** |
| Ladybug-646 | 646 | 73,584 | 327,297 | 226,566 | 18.4M | 10,000* | 123,727 | 14,268 | **8.7x** |
| Dubrovnik-356 | 356 | 226,730 | 1,255,268 | 683,394 | 69.8M | 4,308 | 216,149 | 24,493 | **8.8x** |
\* Hit 10,000 iteration cap (poorly conditioned problem). Both CPU and GPU
hit the same cap, so timing comparison remains valid.
### Profile breakdown (Ladybug-138, nsys)
GPU kernel time is dominated by SpMV (91%). The remaining 9% is BLAS-1
operations (dot, axpy, scal) and NPP element-wise ops (cwiseProduct).
| Kernel | Time (ms) | % | Calls |
|--------|-----------|---|-------|
| cuSPARSE csrmv (SpMV) | 2507 | 91.3% | 7,006 |
| cuBLAS dot | 92 | 3.4% | 21,020 |
| cuBLAS axpy (device ptr) | 27 | 1.0% | 14,012 |
| cuSPARSE partition | 19 | 0.7% | 7,006 |
| NPP cwiseProduct | 16 + 13 | 1.1% | 14,011 + 7,006 |
| cuBLAS axpy (host ptr) | 12 | 0.5% | 7,005 |
| cuBLAS scal (device ptr) | 11 | 0.4% | 7,005 |
| NPP scalar ops | 7 | 0.2% | 7,006 |
### Optimizations applied
Three profiling-driven optimizations reduced GPU CG time by **1.8x**
(6.5s → 3.6s on Ladybug-138):
1. **In-place `cwiseProduct`**: The Jacobi preconditioner apply
(`z = invdiag .* residual`) was allocating a new DeviceMatrix every
iteration. Added `z.cwiseProduct(ctx, a, b)` that reuses `z`'s buffer.
Reduced `cudaMalloc` calls from 7,053 to 23 (saving 2.3s).
2. **`squaredNorm` via `dot(x,x)`**: cuBLAS `nrm2` uses a numerically
careful scaled-sum-of-squares algorithm (29µs/call). Replaced with
`dot(x,x)` (6.4µs/call) — 4.5x faster per call, saving ~320ms.
3. **Device-pointer `scal`**: `p *= beta` was converting `DeviceScalar`
beta to host (triggering a stream sync), then calling host-pointer-mode
scal. Added `operator*=(DeviceScalar)` that uses device-pointer-mode
scal, eliminating one sync per iteration. Halved `cudaStreamSynchronize`
calls from 14K to 7K.
### Observations
1. **GPU speedup scales with problem size**: from 3.5x on small problems
(24K variables) to 8.8x on large problems (683K variables). This is
expected — larger problems have more parallelism for the GPU to exploit.
2. **Iteration counts match**: CPU and GPU CG converge in the same number
of iterations (within 1%), confirming numerical equivalence.
3. **Bottleneck is SpMV**: CG iteration time is dominated (91%) by the
sparse matrix-vector product on H. Further speedup requires either
faster SpMV (e.g., block-sparse formats) or algorithmic improvements
(Schur complement, better preconditioners).
4. **Remaining overhead**: CUDA API calls (cudaMemcpyAsync for 8-byte
DeviceScalar transfers) account for ~50% of non-kernel time. Batching
multiple scalar reductions into a single transfer would help.
5. **Jacobi preconditioner is weak for BA**: The Ladybug-646 problem does
not converge in 10K iterations. Ceres uses block Jacobi or Schur
complement preconditioners that would also benefit from GPU acceleration.
### Scaling plot data
```
# n nnz_H cpu_ms gpu_ms speedup
23769 1793475 4006 1152 3.48
60876 4791762 21498 3553 6.05
226566 18387948 123727 14268 8.67
683394 69827066 216149 24493 8.82
```
## BAL datasets
Downloaded from http://grail.cs.washington.edu/projects/bal/
| File | Source |
|------|--------|
| problem-49-7776-pre.txt | Ladybug sequence |
| problem-138-19878-pre.txt | Ladybug sequence |
| problem-646-73584-pre.txt | Ladybug sequence |
| problem-356-226730-pre.txt | Dubrovnik reconstruction |
## Reproducing
```bash
# Build
cmake -G Ninja -B build-bench-gpu -S benchmarks/GPU -DCMAKE_CUDA_ARCHITECTURES=89
cmake --build build-bench-gpu --target bench_gpu_ba
# Download BAL datasets
wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-49-7776-pre.txt.bz2
wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-138-19878-pre.txt.bz2
wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-646-73584-pre.txt.bz2
wget http://grail.cs.washington.edu/projects/bal/data/dubrovnik/problem-356-226730-pre.txt.bz2
bunzip2 *.bz2
# Run (one at a time)
BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
BAL_FILE=problem-138-19878-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
BAL_FILE=problem-646-73584-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
BAL_FILE=problem-356-226730-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
```

View File

@@ -0,0 +1,533 @@
// Bundle Adjustment benchmark: GPU CG vs CPU CG on real BAL datasets.
//
// Tests Eigen's GPU CG pipeline (DeviceMatrix + GpuSparseContext + DeviceScalar)
// on the normal equations (J^T*J) arising from bundle adjustment problems.
//
// Reads a BAL (Bundle Adjustment in the Large) format file, computes the
// Jacobian and residual, forms the normal equations H = J^T*J + lambda*I,
// then solves H*dx = -J^T*r with both CPU and GPU conjugate gradients.
//
// BAL format: http://grail.cs.washington.edu/projects/bal/
//
// Usage:
// cmake --build build-bench-gpu --target bench_gpu_ba
//
// # Download a BAL dataset (bz2-compressed):
// wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-49-7776-pre.txt.bz2
// bunzip2 problem-49-7776-pre.txt.bz2
//
// # Run on a specific problem:
// BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba
//
// # Append results to the log:
// BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba \
// --benchmark_format=console 2>&1 | tee -a benchmarks/GPU/ba_results.log
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/GPU>
#include <cmath>
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>
using namespace Eigen;
// ============================================================================
// BAL problem data
// ============================================================================
struct BALProblem {
int num_cameras = 0;
int num_points = 0;
int num_observations = 0;
// Observations: (camera_idx, point_idx, observed_x, observed_y).
std::vector<int> camera_index;
std::vector<int> point_index;
std::vector<double> observations_x;
std::vector<double> observations_y;
// Camera parameters: 9 per camera (Rodrigues r[3], translation t[3], f, k1, k2).
std::vector<double> cameras; // [num_cameras * 9]
// 3D points: 3 per point.
std::vector<double> points; // [num_points * 3]
const double* camera(int i) const { return &cameras[i * 9]; }
const double* point(int i) const { return &points[i * 3]; }
bool load(const std::string& filename) {
std::ifstream in(filename);
if (!in) {
fprintf(stderr, "ERROR: Cannot open BAL file: %s\n", filename.c_str());
return false;
}
in >> num_cameras >> num_points >> num_observations;
if (!in || num_cameras <= 0 || num_points <= 0 || num_observations <= 0) {
fprintf(stderr, "ERROR: Invalid BAL header in %s\n", filename.c_str());
return false;
}
camera_index.resize(num_observations);
point_index.resize(num_observations);
observations_x.resize(num_observations);
observations_y.resize(num_observations);
for (int i = 0; i < num_observations; ++i) {
in >> camera_index[i] >> point_index[i] >> observations_x[i] >> observations_y[i];
}
cameras.resize(num_cameras * 9);
for (int i = 0; i < num_cameras * 9; ++i) {
in >> cameras[i];
}
points.resize(num_points * 3);
for (int i = 0; i < num_points * 3; ++i) {
in >> points[i];
}
if (!in) {
fprintf(stderr, "ERROR: Truncated BAL file: %s\n", filename.c_str());
return false;
}
fprintf(stderr, "Loaded BAL: %d cameras, %d points, %d observations\n", num_cameras, num_points, num_observations);
return true;
}
};
// ============================================================================
// Camera projection model (BAL convention)
// ============================================================================
// Rodrigues rotation: rotate point X by axis-angle vector omega.
static void rodrigues_rotate(const double* omega, const double* X, double* result) {
double theta2 = omega[0] * omega[0] + omega[1] * omega[1] + omega[2] * omega[2];
if (theta2 > 1e-30) {
double theta = std::sqrt(theta2);
double costh = std::cos(theta);
double sinth = std::sin(theta);
double k = (1.0 - costh) / theta2;
// Cross product omega x X.
double wx = omega[1] * X[2] - omega[2] * X[1];
double wy = omega[2] * X[0] - omega[0] * X[2];
double wz = omega[0] * X[1] - omega[1] * X[0];
// Dot product omega . X.
double dot = omega[0] * X[0] + omega[1] * X[1] + omega[2] * X[2];
result[0] = X[0] * costh + wx * (sinth / theta) + omega[0] * dot * k;
result[1] = X[1] * costh + wy * (sinth / theta) + omega[1] * dot * k;
result[2] = X[2] * costh + wz * (sinth / theta) + omega[2] * dot * k;
} else {
// Small angle: R ≈ I + [omega]×.
result[0] = X[0] + omega[1] * X[2] - omega[2] * X[1];
result[1] = X[1] + omega[2] * X[0] - omega[0] * X[2];
result[2] = X[2] + omega[0] * X[1] - omega[1] * X[0];
}
}
// Project a 3D point through a camera, returning the 2D residual.
// camera: [r0,r1,r2, t0,t1,t2, f, k1, k2]
// point: [X, Y, Z]
// observed: [ox, oy]
// residual: [rx, ry] = projected - observed
static void project(const double* camera, const double* point, const double* observed, double* residual) {
// Rotate.
double P[3];
rodrigues_rotate(camera, point, P);
// Translate.
P[0] += camera[3];
P[1] += camera[4];
P[2] += camera[5];
// Normalize (BAL convention: negative z).
double xp = -P[0] / P[2];
double yp = -P[1] / P[2];
// Radial distortion.
double r2 = xp * xp + yp * yp;
double distortion = 1.0 + camera[7] * r2 + camera[8] * r2 * r2;
// Apply focal length.
double predicted_x = camera[6] * distortion * xp;
double predicted_y = camera[6] * distortion * yp;
residual[0] = predicted_x - observed[0];
residual[1] = predicted_y - observed[1];
}
// ============================================================================
// Jacobian computation (numerical differentiation)
// ============================================================================
// Compute the 2x9 Jacobian block w.r.t. camera params and 2x3 block w.r.t.
// point coords for a single observation, using central finite differences.
static void compute_jacobian_block(const double* camera, const double* point, const double* observed,
double* J_cam, // 2x9, row-major
double* J_point) // 2x3, row-major
{
constexpr double eps = 1e-8;
// Camera parameters (9).
double cam_pert[9];
std::copy(camera, camera + 9, cam_pert);
for (int j = 0; j < 9; ++j) {
double orig = cam_pert[j];
double rp[2], rm[2];
cam_pert[j] = orig + eps;
project(cam_pert, point, observed, rp);
cam_pert[j] = orig - eps;
project(cam_pert, point, observed, rm);
cam_pert[j] = orig;
J_cam[0 * 9 + j] = (rp[0] - rm[0]) / (2.0 * eps);
J_cam[1 * 9 + j] = (rp[1] - rm[1]) / (2.0 * eps);
}
// Point coordinates (3).
double pt_pert[3];
std::copy(point, point + 3, pt_pert);
for (int j = 0; j < 3; ++j) {
double orig = pt_pert[j];
double rp[2], rm[2];
pt_pert[j] = orig + eps;
project(camera, pt_pert, observed, rp);
pt_pert[j] = orig - eps;
project(camera, pt_pert, observed, rm);
pt_pert[j] = orig;
J_point[0 * 3 + j] = (rp[0] - rm[0]) / (2.0 * eps);
J_point[1 * 3 + j] = (rp[1] - rm[1]) / (2.0 * eps);
}
}
// ============================================================================
// Build normal equations: H = J^T*J + lambda*I, g = -J^T*r
// ============================================================================
struct NormalEquations {
SparseMatrix<double, ColMajor, int> H;
VectorXd g;
VectorXd residual;
double residual_norm;
int jacobian_rows;
int jacobian_cols;
long jacobian_nnz;
};
static NormalEquations build_normal_equations(const BALProblem& problem, double lambda = 1.0) {
const int num_cam_params = problem.num_cameras * 9;
const int num_pt_params = problem.num_points * 3;
const int num_params = num_cam_params + num_pt_params;
const int num_residuals = problem.num_observations * 2;
fprintf(stderr, "Building Jacobian: %d x %d, %ld nonzeros\n", num_residuals, num_params,
(long)problem.num_observations * 24);
// Build J as a triplet list.
using Triplet = Eigen::Triplet<double>;
std::vector<Triplet> triplets;
triplets.reserve(problem.num_observations * 24); // 2 rows × 12 nonzeros = 24 entries per obs
VectorXd residual(num_residuals);
for (int obs = 0; obs < problem.num_observations; ++obs) {
int ci = problem.camera_index[obs];
int pi = problem.point_index[obs];
double observed[2] = {problem.observations_x[obs], problem.observations_y[obs]};
// Compute residual.
double r[2];
project(problem.camera(ci), problem.point(pi), observed, r);
residual[obs * 2 + 0] = r[0];
residual[obs * 2 + 1] = r[1];
// Compute Jacobian blocks.
double J_cam[18], J_pt[6]; // 2x9 and 2x3
compute_jacobian_block(problem.camera(ci), problem.point(pi), observed, J_cam, J_pt);
// Insert camera block: rows [2*obs, 2*obs+1], cols [9*ci, 9*ci+8].
for (int row = 0; row < 2; ++row) {
for (int col = 0; col < 9; ++col) {
double val = J_cam[row * 9 + col];
if (val != 0.0) {
triplets.emplace_back(obs * 2 + row, ci * 9 + col, val);
}
}
}
// Insert point block: rows [2*obs, 2*obs+1], cols [num_cam_params + 3*pi, ...].
for (int row = 0; row < 2; ++row) {
for (int col = 0; col < 3; ++col) {
double val = J_pt[row * 3 + col];
if (val != 0.0) {
triplets.emplace_back(obs * 2 + row, num_cam_params + pi * 3 + col, val);
}
}
}
}
// Build sparse Jacobian.
SparseMatrix<double, ColMajor, int> J(num_residuals, num_params);
J.setFromTriplets(triplets.begin(), triplets.end());
fprintf(stderr, "Jacobian: %dx%d, nnz=%ld\n", (int)J.rows(), (int)J.cols(), (long)J.nonZeros());
// Form normal equations: H = J^T*J + lambda*I.
SparseMatrix<double, ColMajor, int> H = (J.transpose() * J).pruned();
// Add Levenberg-Marquardt damping.
for (int i = 0; i < num_params; ++i) {
H.coeffRef(i, i) += lambda;
}
H.makeCompressed();
// Gradient: g = -J^T * r.
VectorXd g = -(J.transpose() * residual);
double rnorm = residual.norm();
fprintf(stderr, "Normal equations: H is %dx%d, nnz=%ld, |r|=%.6e\n", (int)H.rows(), (int)H.cols(), (long)H.nonZeros(),
rnorm);
return {std::move(H), std::move(g), std::move(residual), rnorm, num_residuals, num_params, (long)J.nonZeros()};
}
// ============================================================================
// Global problem state (loaded once before benchmarks run)
// ============================================================================
static BALProblem g_problem;
static NormalEquations g_neq;
static bool g_loaded = false;
static void ensure_loaded() {
if (g_loaded) return;
const char* bal_file = std::getenv("BAL_FILE");
if (!bal_file) {
fprintf(stderr,
"ERROR: Set BAL_FILE environment variable to a BAL problem file.\n"
" Download from: http://grail.cs.washington.edu/projects/bal/\n"
" Example:\n"
" wget http://grail.cs.washington.edu/projects/bal/data/ladybug/"
"problem-49-7776-pre.txt.bz2\n"
" bunzip2 problem-49-7776-pre.txt.bz2\n"
" BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba\n");
std::exit(1);
}
if (!g_problem.load(bal_file)) {
std::exit(1);
}
g_neq = build_normal_equations(g_problem);
g_loaded = true;
}
// ============================================================================
// CPU CG benchmark
// ============================================================================
static void BM_BA_CPU_CG(benchmark::State& state) {
ensure_loaded();
const auto& H = g_neq.H;
const auto& g = g_neq.g;
ConjugateGradient<SparseMatrix<double, ColMajor, int>, Lower | Upper> cg;
cg.setMaxIterations(10000);
cg.setTolerance(1e-8);
cg.compute(H);
int last_iters = 0;
double last_error = 0;
for (auto _ : state) {
VectorXd dx = cg.solve(g);
benchmark::DoNotOptimize(dx.data());
last_iters = cg.iterations();
last_error = cg.error();
}
state.counters["n"] = H.rows();
state.counters["nnz"] = H.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
state.counters["cameras"] = g_problem.num_cameras;
state.counters["points"] = g_problem.num_points;
state.counters["observations"] = g_problem.num_observations;
}
// ============================================================================
// GPU CG benchmark (with Jacobi preconditioner)
// ============================================================================
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
static void BM_BA_GPU_CG(benchmark::State& state) {
ensure_loaded();
cuda_warmup();
const auto& H = g_neq.H;
const auto& g = g_neq.g;
const Index n = H.rows();
// Extract inverse diagonal (Jacobi preconditioner).
using SpMat = SparseMatrix<double, ColMajor, int>;
VectorXd invdiag(n);
for (Index j = 0; j < H.outerSize(); ++j) {
SpMat::InnerIterator it(H, j);
while (it && it.index() != j) ++it;
if (it && it.index() == j && it.value() != 0.0)
invdiag(j) = 1.0 / it.value();
else
invdiag(j) = 1.0;
}
// Set up GPU context and upload data.
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<double> spmv_ctx(ctx);
auto mat = spmv_ctx.deviceView(H);
auto d_invdiag = DeviceMatrix<double>::fromHost(invdiag, ctx.stream());
auto d_g = DeviceMatrix<double>::fromHost(g, ctx.stream());
int last_iters = 0;
double last_error = 0;
for (auto _ : state) {
DeviceMatrix<double> d_x(n, 1);
d_x.setZero(ctx);
DeviceMatrix<double> residual(n, 1);
residual.copyFrom(ctx, d_g);
double rhsNorm2 = d_g.squaredNorm(ctx);
double threshold = 1e-8 * 1e-8 * rhsNorm2;
double residualNorm2 = residual.squaredNorm(ctx);
DeviceMatrix<double> p = d_invdiag.cwiseProduct(ctx, residual);
DeviceMatrix<double> z(n, 1), tmp(n, 1);
auto absNew = residual.dot(ctx, p);
Index i = 0;
Index maxIters = 10000;
while (i < maxIters) {
tmp.noalias() = mat * p;
auto alpha = absNew / p.dot(ctx, tmp);
d_x += alpha * p;
residual -= alpha * tmp;
residualNorm2 = residual.squaredNorm(ctx);
if (residualNorm2 < threshold) break;
z.cwiseProduct(ctx, d_invdiag, residual); // in-place, no allocation
auto absOld = std::move(absNew);
absNew = residual.dot(ctx, z);
auto beta = absNew / absOld;
p *= beta; // device-pointer scal, no host sync
p += z;
i++;
}
benchmark::DoNotOptimize(d_x.data());
last_iters = i;
last_error = std::sqrt(residualNorm2 / rhsNorm2);
}
GpuContext::setThreadLocal(nullptr);
state.counters["n"] = n;
state.counters["nnz"] = H.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
state.counters["cameras"] = g_problem.num_cameras;
state.counters["points"] = g_problem.num_points;
state.counters["observations"] = g_problem.num_observations;
}
// ============================================================================
// CPU CG with Jacobi preconditioner (apples-to-apples comparison)
// ============================================================================
static void BM_BA_CPU_CG_Jacobi(benchmark::State& state) {
ensure_loaded();
const auto& H = g_neq.H;
const auto& g = g_neq.g;
// Eigen's DiagonalPreconditioner is effectively Jacobi.
ConjugateGradient<SparseMatrix<double, ColMajor, int>, Lower | Upper> cg;
cg.setMaxIterations(10000);
cg.setTolerance(1e-8);
cg.compute(H);
int last_iters = 0;
double last_error = 0;
for (auto _ : state) {
VectorXd dx = cg.solve(g);
benchmark::DoNotOptimize(dx.data());
last_iters = cg.iterations();
last_error = cg.error();
}
state.counters["n"] = H.rows();
state.counters["nnz"] = H.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
}
// ============================================================================
// Register benchmarks
// ============================================================================
BENCHMARK(BM_BA_CPU_CG)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_BA_CPU_CG_Jacobi)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_BA_GPU_CG)->Unit(benchmark::kMillisecond);
// ============================================================================
// Custom main: print summary after benchmarks
// ============================================================================
int main(int argc, char** argv) {
benchmark::Initialize(&argc, argv);
// Print problem info before benchmarks.
const char* bal_file = std::getenv("BAL_FILE");
if (bal_file) {
ensure_loaded();
fprintf(stderr,
"\n"
"=== Bundle Adjustment GPU CG Benchmark ===\n"
"BAL file: %s\n"
"Cameras: %d\n"
"Points: %d\n"
"Observations: %d\n"
"J size: %d x %d, nnz=%ld\n"
"H size: %d x %d, nnz=%ld\n"
"|residual|: %.6e\n"
"==========================================\n\n",
bal_file, g_problem.num_cameras, g_problem.num_points, g_problem.num_observations, g_neq.jacobian_rows,
g_neq.jacobian_cols, g_neq.jacobian_nnz, (int)g_neq.H.rows(), (int)g_neq.H.cols(), (long)g_neq.H.nonZeros(),
g_neq.residual_norm);
}
benchmark::RunSpecifiedBenchmarks();
benchmark::Shutdown();
return 0;
}

View File

@@ -0,0 +1,291 @@
// Benchmark: GPU Conjugate Gradient via DeviceMatrix operators.
//
// Shows the path to running Eigen's CG on GPU with minimal code changes.
// The DeviceMatrix benchmark mirrors Eigen's conjugate_gradient() line-by-line.
// A raw cuBLAS device-pointer-mode implementation is included as a lower bound.
//
// The only change needed in Eigen's CG template to support DeviceMatrix:
// Line 34: typedef Dest VectorType; (instead of Matrix<Scalar, Dynamic, 1>)
//
// Usage:
// cmake --build build-bench-gpu --target bench_gpu_cg_sync
// ./build-bench-gpu/bench_gpu_cg_sync
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/GPU>
#include <cusparse.h>
using namespace Eigen;
using Scalar = double;
using RealScalar = double;
using Vec = Matrix<Scalar, Dynamic, 1>;
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
static SpMat make_spd(Index n) {
SpMat A(n, n);
A.reserve(VectorXi::Constant(n, 3));
for (Index i = 0; i < n; ++i) {
A.insert(i, i) = 4.0;
if (i > 0) A.insert(i, i - 1) = -1.0;
if (i < n - 1) A.insert(i, i + 1) = -1.0;
}
A.makeCompressed();
return A;
}
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
// ==========================================================================
// GPU CG using DeviceMatrix operators — mirrors Eigen's conjugate_gradient()
// ==========================================================================
//
// Compare with Eigen/src/IterativeLinearSolvers/ConjugateGradient.h lines 29-84.
// Left column: Eigen CG code. Right column: this benchmark.
//
// Eigen CG GPU CG (this benchmark)
// -------- -----------------------
// VectorType residual = rhs - mat * x; residual.copyFrom(ctx, rhs); [x=0 so r=b]
// RealScalar rhsNorm2 = rhs.sqNorm(); RealScalar rhsNorm2 = rhs.squaredNorm();
// ...
// tmp.noalias() = mat * p; tmp.noalias() = mat * p; [identical]
// Scalar alpha = absNew / p.dot(tmp); Scalar alpha = absNew / p.dot(tmp); [identical]
// x += alpha * p; x += alpha * p; [identical]
// residual -= alpha * tmp; residual -= alpha * tmp; [identical]
// residualNorm2 = residual.sqNorm(); residualNorm2 = residual.squaredNorm(); [identical]
// ...
// p = z + beta * p; p *= beta; p += z; [equivalent, no alloc]
static void BM_CG_DeviceMatrixOps(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
SpMat A = make_spd(n);
Vec b = Vec::Random(n);
// One shared context: SpMV + BLAS-1 on same stream, zero event overhead.
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<Scalar> spmv(ctx);
auto mat = spmv.deviceView(A);
// Upload RHS once.
auto rhs = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
for (auto _ : state) {
// --- Eigen CG lines 34-63: initialization ---
// typedef Dest VectorType; // GPU CHANGE: was Matrix<Scalar,Dynamic,1>
// VectorType residual = rhs - mat * x; // x=0, so residual = rhs
DeviceMatrix<Scalar> x(n, 1);
x.setZero();
DeviceMatrix<Scalar> residual(n, 1);
residual.copyFrom(ctx, rhs);
// RealScalar rhsNorm2 = rhs.squaredNorm();
RealScalar rhsNorm2 = rhs.squaredNorm();
if (rhsNorm2 == 0) continue;
RealScalar tol = 1e-10;
const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
RealScalar threshold = numext::maxi(RealScalar(tol * tol * rhsNorm2), considerAsZero);
// RealScalar residualNorm2 = residual.squaredNorm();
RealScalar residualNorm2 = residual.squaredNorm();
if (residualNorm2 < threshold) continue;
// VectorType p(n);
// p = precond.solve(residual); // no preconditioner: p = residual
DeviceMatrix<Scalar> p(n, 1);
p.copyFrom(ctx, residual);
// VectorType z(n), tmp(n);
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
// auto absNew = numext::real(residual.dot(p));
// DeviceScalar — stays on device, no sync.
auto absNew = residual.dot(p); // DeviceScalar, no sync
// while (i < maxIters) {
Index maxIters = 200;
Index i = 0;
while (i < maxIters) {
// tmp.noalias() = mat * p;
tmp.noalias() = mat * p; // SpMV, device-resident
// auto alpha = absNew / p.dot(tmp);
// DeviceScalar / DeviceScalar → device kernel, no sync!
auto alpha = absNew / p.dot(tmp); // DeviceScalar, no sync
// x += alpha * p;
// DeviceScalar * DeviceMatrix → device-pointer axpy, no sync!
x += alpha * p;
// residual -= alpha * tmp;
residual -= alpha * tmp; // device-pointer axpy, no sync
// residualNorm2 = residual.squaredNorm();
residualNorm2 = residual.squaredNorm(); // THE one sync per iteration
// if (residualNorm2 < threshold) break;
if (residualNorm2 < threshold) break;
// z = precond.solve(residual);
z.copyFrom(ctx, residual); // no preconditioner
// auto absOld = std::move(absNew);
auto absOld = std::move(absNew); // no sync, no alloc
// absNew = numext::real(residual.dot(z));
absNew = residual.dot(z); // DeviceScalar, no sync
// auto beta = absNew / absOld;
// DeviceScalar / DeviceScalar → device kernel, no sync!
auto beta = absNew / absOld; // DeviceScalar, no sync
// p = z + beta * p;
p *= beta; // device-pointer scal, no host sync
p += z;
i++;
}
}
GpuContext::setThreadLocal(nullptr);
state.SetItemsProcessed(state.iterations() * 200);
}
BENCHMARK(BM_CG_DeviceMatrixOps)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
// ==========================================================================
// Raw cuBLAS device-pointer-mode CG (1 sync/iter) — performance lower bound
// ==========================================================================
__global__ void scalar_div_kernel(const Scalar* a, const Scalar* b, Scalar* out) { *out = *a / *b; }
__global__ void scalar_neg_kernel(const Scalar* in, Scalar* out) { *out = -(*in); }
static void BM_CG_DevicePointerMode(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const int maxIters = 200;
SpMat A = make_spd(n);
Vec b = Vec::Random(n);
cudaStream_t stream;
cudaStreamCreate(&stream);
cublasHandle_t cublas;
cublasCreate(&cublas);
cublasSetStream(cublas, stream);
cusparseHandle_t cusparse;
cusparseCreate(&cusparse);
cusparseSetStream(cusparse, stream);
internal::DeviceBuffer d_outer((n + 1) * sizeof(int));
internal::DeviceBuffer d_inner(A.nonZeros() * sizeof(int));
internal::DeviceBuffer d_vals(A.nonZeros() * sizeof(Scalar));
cudaMemcpy(d_outer.ptr, A.outerIndexPtr(), (n + 1) * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_inner.ptr, A.innerIndexPtr(), A.nonZeros() * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_vals.ptr, A.valuePtr(), A.nonZeros() * sizeof(Scalar), cudaMemcpyHostToDevice);
cusparseSpMatDescr_t matA;
cusparseCreateCsc(&matA, n, n, A.nonZeros(), d_outer.ptr, d_inner.ptr, d_vals.ptr, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
internal::DeviceBuffer d_tmp_buf(n * sizeof(Scalar));
cusparseDnVecDescr_t tmp_x, tmp_y;
cusparseCreateDnVec(&tmp_x, n, d_tmp_buf.ptr, CUDA_R_64F);
cusparseCreateDnVec(&tmp_y, n, d_tmp_buf.ptr, CUDA_R_64F);
Scalar spmv_alpha = 1.0, spmv_beta = 0.0;
size_t ws_size = 0;
cusparseSpMV_bufferSize(cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &spmv_alpha, matA, tmp_x, &spmv_beta, tmp_y,
CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &ws_size);
internal::DeviceBuffer d_workspace(ws_size);
cusparseDestroyDnVec(tmp_x);
cusparseDestroyDnVec(tmp_y);
internal::DeviceBuffer d_x(n * sizeof(Scalar)), d_r(n * sizeof(Scalar));
internal::DeviceBuffer d_p(n * sizeof(Scalar)), d_tmp(n * sizeof(Scalar));
internal::DeviceBuffer d_b(n * sizeof(Scalar));
internal::DeviceBuffer d_absNew(sizeof(Scalar)), d_absOld(sizeof(Scalar));
internal::DeviceBuffer d_pdot(sizeof(Scalar)), d_alpha(sizeof(Scalar));
internal::DeviceBuffer d_neg_alpha(sizeof(Scalar)), d_beta(sizeof(Scalar));
internal::DeviceBuffer d_rnorm(sizeof(RealScalar));
cudaMemcpy(d_b.ptr, b.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice);
auto spmv = [&](Scalar* x_ptr, Scalar* y_ptr) {
cusparseDnVecDescr_t vx, vy;
cusparseCreateDnVec(&vx, n, x_ptr, CUDA_R_64F);
cusparseCreateDnVec(&vy, n, y_ptr, CUDA_R_64F);
cusparseSpMV(cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &spmv_alpha, matA, vx, &spmv_beta, vy, CUDA_R_64F,
CUSPARSE_SPMV_ALG_DEFAULT, d_workspace.ptr);
cusparseDestroyDnVec(vx);
cusparseDestroyDnVec(vy);
};
for (auto _ : state) {
cudaMemsetAsync(static_cast<Scalar*>(d_x.ptr), 0, n * sizeof(Scalar), stream);
cudaMemcpyAsync(d_r.ptr, d_b.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
cudaMemcpyAsync(d_p.ptr, d_b.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_DEVICE);
cublasDdot(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_p.ptr), 1,
static_cast<Scalar*>(d_absNew.ptr));
for (int i = 0; i < maxIters; ++i) {
spmv(static_cast<Scalar*>(d_p.ptr), static_cast<Scalar*>(d_tmp.ptr));
cublasDdot(cublas, n, static_cast<Scalar*>(d_p.ptr), 1, static_cast<Scalar*>(d_tmp.ptr), 1,
static_cast<Scalar*>(d_pdot.ptr));
scalar_div_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_absNew.ptr), static_cast<Scalar*>(d_pdot.ptr),
static_cast<Scalar*>(d_alpha.ptr));
scalar_neg_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_alpha.ptr), static_cast<Scalar*>(d_neg_alpha.ptr));
cublasDaxpy(cublas, n, static_cast<Scalar*>(d_alpha.ptr), static_cast<Scalar*>(d_p.ptr), 1,
static_cast<Scalar*>(d_x.ptr), 1);
cublasDaxpy(cublas, n, static_cast<Scalar*>(d_neg_alpha.ptr), static_cast<Scalar*>(d_tmp.ptr), 1,
static_cast<Scalar*>(d_r.ptr), 1);
cublasDnrm2(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<RealScalar*>(d_rnorm.ptr));
RealScalar rnorm;
cudaMemcpyAsync(&rnorm, d_rnorm.ptr, sizeof(RealScalar), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
if (rnorm * rnorm < 1e-20) break;
cudaMemcpyAsync(d_absOld.ptr, d_absNew.ptr, sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
cublasDdot(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_r.ptr), 1,
static_cast<Scalar*>(d_absNew.ptr));
scalar_div_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_absNew.ptr), static_cast<Scalar*>(d_absOld.ptr),
static_cast<Scalar*>(d_beta.ptr));
cublasDscal(cublas, n, static_cast<Scalar*>(d_beta.ptr), static_cast<Scalar*>(d_p.ptr), 1);
cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_HOST);
Scalar one = 1.0;
cublasDaxpy(cublas, n, &one, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_p.ptr), 1);
cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_DEVICE);
}
cudaStreamSynchronize(stream);
}
state.SetItemsProcessed(state.iterations() * maxIters);
cusparseDestroySpMat(matA);
cusparseDestroy(cusparse);
cublasDestroy(cublas);
cudaStreamDestroy(stream);
}
BENCHMARK(BM_CG_DevicePointerMode)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);

View File

@@ -0,0 +1,216 @@
// Benchmark: GPU CG vs CPU CG on realistic sparse systems.
//
// Tests 2D Laplacian (5-point stencil) and 3D Laplacian (7-point stencil)
// in both float and double precision.
//
// Usage:
// cmake --build build-bench-gpu --target bench_gpu_cg_vs_cpu
// ./build-bench-gpu/bench_gpu_cg_vs_cpu
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Sparse matrix generators -----------------------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_laplacian_2d(int grid_n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
const int n = grid_n * grid_n;
SpMat A(n, n);
A.reserve(VectorXi::Constant(n, 5));
for (int i = 0; i < grid_n; ++i) {
for (int j = 0; j < grid_n; ++j) {
int idx = i * grid_n + j;
A.insert(idx, idx) = Scalar(4);
if (i > 0) A.insert(idx, idx - grid_n) = Scalar(-1);
if (i < grid_n - 1) A.insert(idx, idx + grid_n) = Scalar(-1);
if (j > 0) A.insert(idx, idx - 1) = Scalar(-1);
if (j < grid_n - 1) A.insert(idx, idx + 1) = Scalar(-1);
}
}
A.makeCompressed();
return A;
}
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_laplacian_3d(int grid_n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
const int n = grid_n * grid_n * grid_n;
const int n2 = grid_n * grid_n;
SpMat A(n, n);
A.reserve(VectorXi::Constant(n, 7));
for (int i = 0; i < grid_n; ++i) {
for (int j = 0; j < grid_n; ++j) {
for (int k = 0; k < grid_n; ++k) {
int idx = i * n2 + j * grid_n + k;
A.insert(idx, idx) = Scalar(6);
if (i > 0) A.insert(idx, idx - n2) = Scalar(-1);
if (i < grid_n - 1) A.insert(idx, idx + n2) = Scalar(-1);
if (j > 0) A.insert(idx, idx - grid_n) = Scalar(-1);
if (j < grid_n - 1) A.insert(idx, idx + grid_n) = Scalar(-1);
if (k > 0) A.insert(idx, idx - 1) = Scalar(-1);
if (k < grid_n - 1) A.insert(idx, idx + 1) = Scalar(-1);
}
}
}
A.makeCompressed();
return A;
}
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
// ---- CPU CG -----------------------------------------------------------------
template <typename Scalar, typename MatGen>
void run_cpu_cg(benchmark::State& state, MatGen make_matrix) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
const int grid_n = state.range(0);
SpMat A = make_matrix(grid_n);
Vec b = Vec::Random(A.rows());
ConjugateGradient<SpMat, Lower | Upper> cg;
cg.setMaxIterations(10000);
cg.setTolerance(RealScalar(1e-8));
cg.compute(A);
int last_iters = 0;
for (auto _ : state) {
Vec x = cg.solve(b);
benchmark::DoNotOptimize(x.data());
last_iters = cg.iterations();
}
state.counters["n"] = A.rows();
state.counters["nnz"] = A.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = cg.error();
}
// ---- GPU CG -----------------------------------------------------------------
template <typename Scalar, typename MatGen>
void run_gpu_cg(benchmark::State& state, MatGen make_matrix) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
cuda_warmup();
const int grid_n = state.range(0);
SpMat A = make_matrix(grid_n);
const Index n = A.rows();
Vec b = Vec::Random(n);
// Extract inverse diagonal.
Vec invdiag(n);
for (Index j = 0; j < A.outerSize(); ++j) {
typename SpMat::InnerIterator it(A, j);
while (it && it.index() != j) ++it;
if (it && it.index() == j && it.value() != Scalar(0))
invdiag(j) = Scalar(1) / it.value();
else
invdiag(j) = Scalar(1);
}
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<Scalar> spmv_ctx(ctx);
auto mat = spmv_ctx.deviceView(A);
auto d_invdiag = DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
int last_iters = 0;
RealScalar last_error = 0;
for (auto _ : state) {
DeviceMatrix<Scalar> d_x(n, 1);
d_x.setZero(ctx);
DeviceMatrix<Scalar> residual(n, 1);
residual.copyFrom(ctx, d_b);
RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
RealScalar tol = RealScalar(1e-8);
RealScalar threshold = tol * tol * rhsNorm2;
RealScalar residualNorm2 = residual.squaredNorm(ctx);
DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
auto absNew = residual.dot(ctx, p);
Index i = 0;
Index maxIters = 10000;
while (i < maxIters) {
tmp.noalias() = mat * p;
auto alpha = absNew / p.dot(ctx, tmp);
d_x += alpha * p;
residual -= alpha * tmp;
residualNorm2 = residual.squaredNorm(ctx);
if (residualNorm2 < threshold) break;
z.cwiseProduct(ctx, d_invdiag, residual);
auto absOld = std::move(absNew);
absNew = residual.dot(ctx, z);
auto beta = absNew / absOld;
p *= beta;
p += z;
i++;
}
benchmark::DoNotOptimize(d_x.data());
last_iters = i;
last_error = numext::sqrt(residualNorm2 / rhsNorm2);
}
GpuContext::setThreadLocal(nullptr);
state.counters["n"] = n;
state.counters["nnz"] = A.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
}
// ---- 2D Laplacian, double ---------------------------------------------------
static void BM_CG_CPU_2D_double(benchmark::State& state) { run_cpu_cg<double>(state, make_laplacian_2d<double>); }
static void BM_CG_GPU_2D_double(benchmark::State& state) { run_gpu_cg<double>(state, make_laplacian_2d<double>); }
BENCHMARK(BM_CG_CPU_2D_double)->ArgsProduct({{32, 64, 128, 256, 512}});
BENCHMARK(BM_CG_GPU_2D_double)->ArgsProduct({{32, 64, 128, 256, 512}});
// ---- 2D Laplacian, float ----------------------------------------------------
static void BM_CG_CPU_2D_float(benchmark::State& state) { run_cpu_cg<float>(state, make_laplacian_2d<float>); }
static void BM_CG_GPU_2D_float(benchmark::State& state) { run_gpu_cg<float>(state, make_laplacian_2d<float>); }
BENCHMARK(BM_CG_CPU_2D_float)->ArgsProduct({{32, 64, 128, 256, 512}});
BENCHMARK(BM_CG_GPU_2D_float)->ArgsProduct({{32, 64, 128, 256, 512}});
// ---- 3D Laplacian, double ---------------------------------------------------
static void BM_CG_CPU_3D_double(benchmark::State& state) { run_cpu_cg<double>(state, make_laplacian_3d<double>); }
static void BM_CG_GPU_3D_double(benchmark::State& state) { run_gpu_cg<double>(state, make_laplacian_3d<double>); }
BENCHMARK(BM_CG_CPU_3D_double)->ArgsProduct({{16, 32, 48, 64}});
BENCHMARK(BM_CG_GPU_3D_double)->ArgsProduct({{16, 32, 48, 64}});
// ---- 3D Laplacian, float ----------------------------------------------------
static void BM_CG_CPU_3D_float(benchmark::State& state) { run_cpu_cg<float>(state, make_laplacian_3d<float>); }
static void BM_CG_GPU_3D_float(benchmark::State& state) { run_gpu_cg<float>(state, make_laplacian_3d<float>); }
BENCHMARK(BM_CG_CPU_3D_float)->ArgsProduct({{16, 32, 48, 64}});
BENCHMARK(BM_CG_GPU_3D_float)->ArgsProduct({{16, 32, 48, 64}});

View File

@@ -481,13 +481,13 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
ei_add_test(gpu_basic) ei_add_test(gpu_basic)
ei_add_test(gpu_library_example "" "CUDA::cusolver") ei_add_test(gpu_library_example "" "CUDA::cusolver")
# DeviceMatrix tests: only CUDA runtime, no NVIDIA libraries. # DeviceMatrix tests: CUDA runtime + cuBLAS + cuSOLVER (for BLAS-1 ops via GpuContext).
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
add_executable(gpu_device_matrix gpu_device_matrix.cpp) add_executable(gpu_device_matrix gpu_device_matrix.cpp)
target_include_directories(gpu_device_matrix PRIVATE target_include_directories(gpu_device_matrix PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include" "${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}") "${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart) target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart CUDA::cublas CUDA::cusolver CUDA::npps CUDA::nppc)
target_compile_definitions(gpu_device_matrix PRIVATE target_compile_definitions(gpu_device_matrix PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE} EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1) EIGEN_TEST_PART_ALL=1)
@@ -575,7 +575,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
"${CUDA_TOOLKIT_ROOT_DIR}/include" "${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}") "${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_cusparse_spmv target_link_libraries(gpu_cusparse_spmv
Eigen3::Eigen CUDA::cudart CUDA::cusparse) Eigen3::Eigen CUDA::cudart CUDA::cusparse CUDA::cublas CUDA::cusolver)
target_compile_definitions(gpu_cusparse_spmv PRIVATE target_compile_definitions(gpu_cusparse_spmv PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE} EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1) EIGEN_TEST_PART_ALL=1)
@@ -584,6 +584,23 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
add_dependencies(buildtests_gpu gpu_cusparse_spmv) add_dependencies(buildtests_gpu gpu_cusparse_spmv)
set_property(TEST gpu_cusparse_spmv APPEND PROPERTY LABELS "Official;gpu") set_property(TEST gpu_cusparse_spmv APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST gpu_cusparse_spmv PROPERTY SKIP_RETURN_CODE 77) set_property(TEST gpu_cusparse_spmv PROPERTY SKIP_RETURN_CODE 77)
# End-to-end GPU CG test: Eigen's ConjugateGradient with DeviceMatrix.
add_executable(gpu_cg gpu_cg.cpp)
target_include_directories(gpu_cg PRIVATE
"${CUDA_TOOLKIT_ROOT_DIR}/include"
"${CMAKE_CURRENT_BINARY_DIR}")
target_link_libraries(gpu_cg
Eigen3::Eigen CUDA::cudart CUDA::cusparse CUDA::cublas CUDA::cusolver CUDA::npps CUDA::nppc)
target_compile_definitions(gpu_cg PRIVATE
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
EIGEN_TEST_PART_ALL=1)
add_test(NAME gpu_cg COMMAND gpu_cg)
add_dependencies(buildtests gpu_cg)
add_dependencies(buildtests_gpu gpu_cg)
set_property(TEST gpu_cg APPEND PROPERTY LABELS "Official;gpu")
set_property(TEST gpu_cg PROPERTY SKIP_RETURN_CODE 77)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
endif() endif()

224
test/gpu_cg.cpp Normal file
View File

@@ -0,0 +1,224 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// End-to-end test: CG algorithm running on GPU via DeviceMatrix.
//
// Uses DeviceSparseView for SpMV, DeviceMatrix for vectors, DeviceScalar
// for deferred reductions. Verifies correctness against CPU ConjugateGradient.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a sparse SPD matrix --------------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat R(n, n);
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
for (Index j = 0; j < n; ++j) {
for (Index i = 0; i < n; ++i) {
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
}
}
}
R.makeCompressed();
SpMat A = R.adjoint() * R;
for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
A.makeCompressed();
return A;
}
// ---- GPU CG without preconditioner ------------------------------------------
template <typename Scalar>
void test_gpu_cg(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
// CPU reference (identity preconditioner to match GPU).
ConjugateGradient<SpMat, Lower | Upper, IdentityPreconditioner> cpu_cg;
cpu_cg.setMaxIterations(1000);
cpu_cg.setTolerance(RealScalar(1e-8));
cpu_cg.compute(A);
Vec x_cpu = cpu_cg.solve(b);
VERIFY_IS_EQUAL(cpu_cg.info(), Success);
// GPU CG: mirrors Eigen's conjugate_gradient() using DeviceMatrix ops.
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<Scalar> spmv_ctx(ctx);
auto mat = spmv_ctx.deviceView(A);
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
DeviceMatrix<Scalar> d_x(n, 1);
d_x.setZero(ctx);
// r = b (since x=0)
DeviceMatrix<Scalar> residual(n, 1);
residual.copyFrom(ctx, d_b);
RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
RealScalar tol = RealScalar(1e-8);
RealScalar threshold = tol * tol * rhsNorm2;
RealScalar residualNorm2 = residual.squaredNorm(ctx);
// p = r (no preconditioner)
DeviceMatrix<Scalar> p(n, 1);
p.copyFrom(ctx, residual);
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
auto absNew = residual.dot(ctx, p);
Index maxIters = 1000;
Index i = 0;
while (i < maxIters) {
tmp.noalias() = mat * p;
auto alpha = absNew / p.dot(ctx, tmp);
d_x += alpha * p;
residual -= alpha * tmp;
residualNorm2 = residual.squaredNorm(ctx);
if (residualNorm2 < threshold) break;
// z = r (no preconditioner)
z.copyFrom(ctx, residual);
auto absOld = std::move(absNew);
absNew = residual.dot(ctx, z);
auto beta = absNew / absOld;
p *= Scalar(beta);
p += z;
i++;
}
GpuContext::setThreadLocal(nullptr);
Vec x_gpu = d_x.toHost(ctx.stream());
// Verify residual.
Vec r = A * x_gpu - b;
RealScalar relres = r.norm() / b.norm();
VERIFY(relres < RealScalar(1e-6));
// Compare with CPU.
RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
}
// ---- GPU CG with Jacobi preconditioner --------------------------------------
template <typename Scalar>
void test_gpu_cg_jacobi(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_spd<Scalar>(n);
Vec b = Vec::Random(n);
// CPU reference.
ConjugateGradient<SpMat, Lower | Upper> cpu_cg;
cpu_cg.setMaxIterations(1000);
cpu_cg.setTolerance(RealScalar(1e-8));
cpu_cg.compute(A);
Vec x_cpu = cpu_cg.solve(b);
// Extract inverse diagonal.
Vec invdiag(n);
for (Index j = 0; j < A.outerSize(); ++j) {
typename SpMat::InnerIterator it(A, j);
while (it && it.index() != j) ++it;
if (it && it.index() == j && it.value() != Scalar(0))
invdiag(j) = Scalar(1) / it.value();
else
invdiag(j) = Scalar(1);
}
// GPU CG with Jacobi preconditioner.
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<Scalar> spmv_ctx(ctx);
auto mat = spmv_ctx.deviceView(A);
auto d_invdiag = DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
DeviceMatrix<Scalar> d_x(n, 1);
d_x.setZero(ctx);
DeviceMatrix<Scalar> residual(n, 1);
residual.copyFrom(ctx, d_b);
RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
RealScalar tol = RealScalar(1e-8);
RealScalar threshold = tol * tol * rhsNorm2;
RealScalar residualNorm2 = residual.squaredNorm(ctx);
// p = precond.solve(r) = invdiag .* r
DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
auto absNew = residual.dot(ctx, p);
Index maxIters = 1000;
Index i = 0;
while (i < maxIters) {
tmp.noalias() = mat * p;
auto alpha = absNew / p.dot(ctx, tmp);
d_x += alpha * p;
residual -= alpha * tmp;
residualNorm2 = residual.squaredNorm(ctx);
if (residualNorm2 < threshold) break;
// z = precond.solve(r) = invdiag .* r
z.cwiseProduct(ctx, d_invdiag, residual);
auto absOld = std::move(absNew);
absNew = residual.dot(ctx, z);
auto beta = absNew / absOld;
p *= beta;
p += z;
i++;
}
GpuContext::setThreadLocal(nullptr);
Vec x_gpu = d_x.toHost(ctx.stream());
Vec r = A * x_gpu - b;
RealScalar relres = r.norm() / b.norm();
VERIFY(relres < RealScalar(1e-6));
RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
}
EIGEN_DECLARE_TEST(gpu_cg) {
CALL_SUBTEST(test_gpu_cg<double>(64));
CALL_SUBTEST(test_gpu_cg<double>(256));
CALL_SUBTEST(test_gpu_cg<float>(64));
CALL_SUBTEST(test_gpu_cg_jacobi<double>(64));
CALL_SUBTEST(test_gpu_cg_jacobi<double>(256));
CALL_SUBTEST(test_gpu_cg_jacobi<float>(64));
}

View File

@@ -180,6 +180,105 @@ void test_empty() {
VERIFY_IS_EQUAL(y.size(), 0); VERIFY_IS_EQUAL(y.size(), 0);
} }
// ---- DeviceMatrix SpMV (no host roundtrip) ----------------------------------
template <typename Scalar>
void test_spmv_device(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(n, n);
Vec x = Vec::Random(n);
// Use shared GpuContext for same-stream execution.
GpuContext gpu_ctx;
GpuSparseContext<Scalar> ctx(gpu_ctx);
auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
DeviceMatrix<Scalar> d_y;
ctx.multiply(A, d_x, d_y);
Vec y_gpu = d_y.toHost(gpu_ctx.stream());
Vec y_cpu = A * x;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- Expression syntax: d_y = d_A * d_x ------------------------------------
template <typename Scalar>
void test_spmv_expr(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A = make_sparse<Scalar>(n, n);
Vec x = Vec::Random(n);
GpuContext gpu_ctx;
GpuSparseContext<Scalar> ctx(gpu_ctx);
// Upload sparse matrix and create device view.
auto d_A = ctx.deviceView(A);
// Upload x.
auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
// Expression syntax: d_y = d_A * d_x
DeviceMatrix<Scalar> d_y;
d_y = d_A * d_x;
// Also test with noalias():
DeviceMatrix<Scalar> d_tmp;
d_tmp.noalias() = d_A * d_x;
Vec y_gpu = d_y.toHost(gpu_ctx.stream());
Vec tmp_gpu = d_tmp.toHost(gpu_ctx.stream());
Vec y_cpu = A * x;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
VERIFY((tmp_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}
// ---- deviceView overwrite: second view replaces first -----------------------
template <typename Scalar>
void test_deviceview_overwrite(Index n) {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
SpMat A1 = make_sparse<Scalar>(n, n);
SpMat A2 = make_sparse<Scalar>(n, n); // different random matrix
Vec x = Vec::Random(n);
GpuContext gpu_ctx;
GpuSparseContext<Scalar> ctx(gpu_ctx);
// First view: A1.
auto d_A1 = ctx.deviceView(A1);
auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
DeviceMatrix<Scalar> d_y1;
d_y1 = d_A1 * d_x;
Vec y1_gpu = d_y1.toHost(gpu_ctx.stream());
Vec y1_cpu = A1 * x;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((y1_gpu - y1_cpu).norm() / (y1_cpu.norm() + RealScalar(1)) < tol);
// Second view overwrites first: now uses A2.
auto d_A2 = ctx.deviceView(A2);
DeviceMatrix<Scalar> d_y2;
d_y2 = d_A2 * d_x;
Vec y2_gpu = d_y2.toHost(gpu_ctx.stream());
Vec y2_cpu = A2 * x;
VERIFY((y2_gpu - y2_cpu).norm() / (y2_cpu.norm() + RealScalar(1)) < tol);
}
// ---- Per-scalar driver ------------------------------------------------------ // ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar> template <typename Scalar>
@@ -193,6 +292,9 @@ void test_scalar() {
CALL_SUBTEST(test_identity<Scalar>(64)); CALL_SUBTEST(test_identity<Scalar>(64));
CALL_SUBTEST(test_reuse<Scalar>(64)); CALL_SUBTEST(test_reuse<Scalar>(64));
CALL_SUBTEST(test_empty<Scalar>()); CALL_SUBTEST(test_empty<Scalar>());
CALL_SUBTEST(test_spmv_device<Scalar>(64));
CALL_SUBTEST(test_spmv_expr<Scalar>(64));
CALL_SUBTEST(test_deviceview_overwrite<Scalar>(64));
} }
EIGEN_DECLARE_TEST(gpu_cusparse_spmv) { EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {

View File

@@ -12,6 +12,7 @@
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "main.h" #include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU> #include <Eigen/GPU>
using namespace Eigen; using namespace Eigen;
@@ -230,6 +231,217 @@ void test_scalar() {
CALL_SUBTEST(test_move_assign<Scalar>(64, 64)); CALL_SUBTEST(test_move_assign<Scalar>(64, 64));
} }
// ---- BLAS-1: dot product ----------------------------------------------------
template <typename Scalar>
void test_blas1(Index n) {
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
// All BLAS-1 ops share one GpuContext — same stream, zero event overhead.
GpuContext ctx;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
// dot
{
Vec a = Vec::Random(n);
Vec b = Vec::Random(n);
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
Scalar gpu_dot = d_a.dot(ctx, d_b);
Scalar cpu_dot = a.dot(b);
VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
}
// norm / squaredNorm
{
Vec a = Vec::Random(n);
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
RealScalar gpu_norm = d_a.norm(ctx);
RealScalar cpu_norm = a.norm();
VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol);
RealScalar gpu_sqnorm = d_a.squaredNorm(ctx);
RealScalar cpu_sqnorm = a.squaredNorm();
VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol);
}
// addScaled (axpy)
{
Vec x = Vec::Random(n);
Vec y = Vec::Random(n);
Scalar alpha(2.5);
Vec y_ref = y + alpha * x;
auto d_y = DeviceMatrix<Scalar>::fromHost(y, ctx.stream());
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
d_y.addScaled(ctx, alpha, d_x);
Vec y_gpu = d_y.toHost(ctx.stream());
VERIFY((y_gpu - y_ref).norm() < tol * y_ref.norm() + tol);
}
// scale (scal)
{
Vec x = Vec::Random(n);
Scalar alpha(3.0);
Vec x_ref = alpha * x;
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
d_x.scale(ctx, alpha);
Vec x_gpu = d_x.toHost(ctx.stream());
VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol);
}
// copyFrom
{
Vec x = Vec::Random(n);
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
DeviceMatrix<Scalar> d_y;
d_y.copyFrom(ctx, d_x);
Vec y = d_y.toHost(ctx.stream());
VERIFY_IS_APPROX(y, x);
}
// setZero
{
Vec x = Vec::Random(n);
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
d_x.setZero(ctx);
Vec result = d_x.toHost(ctx.stream());
VERIFY_IS_EQUAL(result, Vec::Zero(n));
}
}
// ---- BLAS-1 operator overloads (CG-style) -----------------------------------
template <typename Scalar>
void test_cg_operators(Index n) {
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
Vec x = Vec::Random(n);
Vec p = Vec::Random(n);
Vec tmp = Vec::Random(n);
Vec z = Vec::Random(n);
Scalar alpha(2.5);
Scalar beta(0.7);
// Test: x += alpha * p
{
Vec x_ref = x + alpha * p;
auto d_x = DeviceMatrix<Scalar>::fromHost(x);
auto d_p = DeviceMatrix<Scalar>::fromHost(p);
d_x += alpha * d_p;
Vec x_gpu = d_x.toHost();
VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol);
}
// Test: r -= alpha * tmp
{
Vec r = Vec::Random(n);
Vec r_ref = r - alpha * tmp;
auto d_r = DeviceMatrix<Scalar>::fromHost(r);
auto d_tmp = DeviceMatrix<Scalar>::fromHost(tmp);
d_r -= alpha * d_tmp;
Vec r_gpu = d_r.toHost();
VERIFY((r_gpu - r_ref).norm() < tol * r_ref.norm() + tol);
}
// Test: p = z + beta * p (cuBLAS geam)
{
Vec p_copy = p;
Vec p_ref = z + beta * p_copy;
auto d_p = DeviceMatrix<Scalar>::fromHost(p_copy);
auto d_z = DeviceMatrix<Scalar>::fromHost(z);
d_p = d_z + beta * d_p;
Vec p_gpu = d_p.toHost();
VERIFY((p_gpu - p_ref).norm() < tol * p_ref.norm() + tol);
}
// Test: operator+= and operator-= with DeviceMatrix (no scalar)
{
Vec a = Vec::Random(n);
Vec b = Vec::Random(n);
Vec a_ref = a + b;
auto d_a = DeviceMatrix<Scalar>::fromHost(a);
auto d_b = DeviceMatrix<Scalar>::fromHost(b);
d_a += d_b;
VERIFY((d_a.toHost() - a_ref).norm() < tol * a_ref.norm() + tol);
}
}
// ---- DeviceScalar: deferred sync -------------------------------------------
template <typename Scalar>
void test_device_scalar() {
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
const Index n = 256;
Vec a = Vec::Random(n);
Vec b = Vec::Random(n);
GpuContext ctx;
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
// dot() returns DeviceScalar — implicit conversion to Scalar syncs.
Scalar gpu_dot = d_a.dot(ctx, d_b);
Scalar cpu_dot = a.dot(b);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
// squaredNorm() returns host RealScalar directly (syncs internally).
RealScalar gpu_sqnorm = d_a.squaredNorm(ctx);
RealScalar cpu_sqnorm = a.squaredNorm();
VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol);
// norm() returns DeviceScalar<RealScalar> — implicit conversion syncs.
RealScalar gpu_norm = d_a.norm(ctx);
RealScalar cpu_norm = a.norm();
VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol);
// Convenience overloads (thread-local context).
GpuContext::setThreadLocal(&ctx);
Scalar gpu_dot2 = d_a.dot(d_b);
VERIFY(numext::abs(gpu_dot2 - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
GpuContext::setThreadLocal(nullptr);
// Empty vectors: dot and norm must return zero.
{
DeviceMatrix<Scalar> d_empty(0, 1);
DeviceMatrix<Scalar> d_empty2(0, 1);
Scalar empty_dot = d_empty.dot(ctx, d_empty2);
VERIFY_IS_EQUAL(empty_dot, Scalar(0));
RealScalar empty_sqnorm = d_empty.squaredNorm(ctx);
VERIFY_IS_EQUAL(empty_sqnorm, RealScalar(0));
RealScalar empty_norm = d_empty.norm(ctx);
VERIFY_IS_EQUAL(empty_norm, RealScalar(0));
}
}
// ---- cwiseProduct -----------------------------------------------------------
template <typename Scalar>
void test_cwiseProduct() {
using Vec = Matrix<Scalar, Dynamic, 1>;
using RealScalar = typename NumTraits<Scalar>::Real;
const Index n = 256;
Vec a = Vec::Random(n);
Vec b = Vec::Random(n);
Vec ref = a.array() * b.array();
GpuContext ctx;
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
auto d_c = d_a.cwiseProduct(ctx, d_b);
Vec result = d_c.toHost(ctx.stream());
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((result - ref).norm() < tol * ref.norm() + tol);
}
EIGEN_DECLARE_TEST(gpu_device_matrix) { EIGEN_DECLARE_TEST(gpu_device_matrix) {
CALL_SUBTEST(test_default_construct()); CALL_SUBTEST(test_default_construct());
CALL_SUBTEST(test_empty()); CALL_SUBTEST(test_empty());
@@ -242,4 +454,18 @@ EIGEN_DECLARE_TEST(gpu_device_matrix) {
CALL_SUBTEST(test_scalar<double>()); CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>()); CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>()); CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_blas1<float>(256));
CALL_SUBTEST(test_blas1<double>(256));
CALL_SUBTEST(test_blas1<std::complex<float>>(256));
CALL_SUBTEST(test_blas1<std::complex<double>>(256));
CALL_SUBTEST(test_cg_operators<float>(256));
CALL_SUBTEST(test_cg_operators<double>(256));
CALL_SUBTEST(test_cg_operators<std::complex<float>>(256));
CALL_SUBTEST(test_cg_operators<std::complex<double>>(256));
CALL_SUBTEST(test_device_scalar<float>());
CALL_SUBTEST(test_device_scalar<double>());
CALL_SUBTEST(test_device_scalar<std::complex<float>>());
CALL_SUBTEST(test_device_scalar<std::complex<double>>());
CALL_SUBTEST(test_cwiseProduct<float>());
CALL_SUBTEST(test_cwiseProduct<double>());
} }