diff --git a/Eigen/GPU b/Eigen/GPU
index b2aae8337..30630ac2e 100644
--- a/Eigen/GPU
+++ b/Eigen/GPU
@@ -39,6 +39,7 @@
 #ifdef EIGEN_USE_GPU
 
 // IWYU pragma: begin_exports
+#include "src/GPU/DeviceScalar.h"
 #include "src/GPU/DeviceMatrix.h"
 #include "src/GPU/GpuContext.h"
 #include "src/GPU/DeviceExpr.h"
@@ -53,8 +54,10 @@
 #include "src/GPU/CuFftSupport.h"
 #include "src/GPU/GpuFFT.h"
 #include "src/GPU/CuSparseSupport.h"
+#ifdef EIGEN_SPARSECORE_MODULE_H
 #include "src/GPU/GpuSparseContext.h"
-#ifdef EIGEN_CUDSS
+#endif
+#if defined(EIGEN_CUDSS) && defined(EIGEN_SPARSECORE_MODULE_H)
 #include "src/GPU/CuDssSupport.h"
 #include "src/GPU/GpuSparseSolverBase.h"
 #include "src/GPU/GpuSparseLLT.h"
diff --git a/Eigen/src/GPU/CuBlasSupport.h b/Eigen/src/GPU/CuBlasSupport.h
index c3ba916f5..fc64b04b5 100644
--- a/Eigen/src/GPU/CuBlasSupport.h
+++ b/Eigen/src/GPU/CuBlasSupport.h
@@ -21,6 +21,7 @@
 #include "./GpuSupport.h"
 #include <cublas_v2.h>
+#include <cublasLt.h>
 
 namespace Eigen {
 namespace internal {
@@ -50,14 +51,14 @@ constexpr cublasOperation_t to_cublas_op(GpuOp op) {
 }
 
 // ---- Scalar → cublasComputeType_t -------------------------------------------
-// cublasGemmEx requires a compute type (separate from the data type).
+// cublasLtMatmul requires a compute type (separate from the data type).
 //
 // Precision policy:
-//   - Default: tensor core algorithms enabled (CUBLAS_GEMM_DEFAULT_TENSOR_OP).
+//   - Default: tensor core algorithms enabled via cublasLtMatmul heuristics.
 //     For double, cuBLAS may use Ozaki emulation on sm_80+ tensor cores.
 //   - EIGEN_CUDA_TF32: opt-in to TF32 for float (~2x faster, 10-bit mantissa).
 //   - EIGEN_NO_CUDA_TENSOR_OPS: disables all tensor core usage. Uses pedantic
-//     compute types and CUBLAS_GEMM_DEFAULT algorithm. For bit-exact reproducibility.
+//     compute types. For bit-exact reproducibility.
 template <typename Scalar>
 struct cuda_compute_type;
@@ -98,8 +99,47 @@ struct cuda_compute_type<std::complex<double>> {
   static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
 #endif
 };
 
-// ---- GEMM algorithm hint ----------------------------------------------------
+// ---- Alpha/beta scalar type for cublasLtMatmul ------------------------------
+// For standard types, alpha/beta match the scalar type.
+template <typename Scalar>
+struct cuda_gemm_scalar {
+  using type = Scalar;
+};
+
+// ---- cublasLt GEMM dispatch -------------------------------------------------
+// Wraps cublasLtMatmul with descriptor setup, heuristic algorithm selection,
+// and lazy workspace management. Supports 64-bit dimensions natively.
+//
+// The workspace buffer (DeviceBuffer*) is grown lazily to match the selected
+// algorithm's actual requirement. The heuristic is queried with a generous
+// 32 MB cap so that the best algorithm is never excluded. Growth is monotonic:
+// the buffer only grows, never shrinks, so reallocation happens at most a few
+// times during the lifetime of the owning GpuContext or solver.
+//
+// EIGEN_NO_CUDA_TENSOR_OPS: pedantic compute types (CUBLAS_COMPUTE_32F_PEDANTIC,
+// CUBLAS_COMPUTE_64F_PEDANTIC) prevent cublasLt from selecting tensor core
+// algorithms, matching the previous cublasGemmEx behavior.
+//
+// Thread safety: the workspace buffer is not thread-safe. All GEMM calls
+// sharing a workspace must be on the same CUDA stream (guaranteed by GpuContext's
+// single-stream design and by each GpuSVD owning its own stream).
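// Illustrative usage sketch (not part of the original patch): a typical caller
// passes the GpuContext's lazily created handles and workspace straight through,
// mirroring the dispatch_gemm() call later in this patch. `ctx`, A, B, C, and the
// dimension variables are placeholders:
//
//   Scalar alpha(1), beta(0);
//   internal::cublaslt_gemm(ctx.cublasLtHandle(), ctx.cublasHandle(),
//                           CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
//                           &alpha, A.data(), /*lda=*/m, B.data(), /*ldb=*/k,
//                           &beta, C.data(), /*ldc=*/m,
//                           ctx.gemmWorkspace(), ctx.stream());
//
// The workspace buffer pointer is grown inside cublaslt_gemm as needed.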
+// +// Future optimization: for hot loops (e.g., CG iteration), caching descriptors +// and the selected algorithm by (m, n, k, dtype, transA, transB) would avoid +// per-call descriptor creation and heuristic lookup overhead. + +#define EIGEN_CUBLASLT_CHECK(expr) \ + do { \ + cublasStatus_t _s = (expr); \ + eigen_assert(_s == CUBLAS_STATUS_SUCCESS && "cuBLASLt call failed"); \ + } while (0) + +// Maximum workspace the heuristic is allowed to consider. This is a preference +// ceiling, not an allocation — actual allocation matches the selected algorithm. +static constexpr size_t kCublasLtMaxWorkspaceBytes = 32 * 1024 * 1024; // 32 MB + +// cublasGemmEx fallback algorithm hint (used when cublasLt heuristic returns no results). constexpr cublasGemmAlgo_t cuda_gemm_algo() { #ifdef EIGEN_NO_CUDA_TENSOR_OPS return CUBLAS_GEMM_DEFAULT; @@ -108,13 +148,73 @@ constexpr cublasGemmAlgo_t cuda_gemm_algo() { #endif } -// ---- Alpha/beta scalar type for cublasGemmEx -------------------------------- -// For standard types, alpha/beta match the scalar type. - template -struct cuda_gemm_scalar { - using type = Scalar; -}; +void cublaslt_gemm(cublasLtHandle_t lt_handle, cublasHandle_t cublas_handle, cublasOperation_t transA, + cublasOperation_t transB, int64_t m, int64_t n, int64_t k, + const typename cuda_gemm_scalar::type* alpha, const Scalar* A, int64_t lda, const Scalar* B, + int64_t ldb, const typename cuda_gemm_scalar::type* beta, Scalar* C, int64_t ldc, + DeviceBuffer* workspace, cudaStream_t stream) { + constexpr cudaDataType_t dtype = cuda_data_type::value; + constexpr cublasComputeType_t compute = cuda_compute_type::value; + using AlphaType = typename cuda_gemm_scalar::type; + constexpr cudaDataType_t alpha_type = cuda_data_type::value; + + // Matmul descriptor. + cublasLtMatmulDesc_t matmul_desc = nullptr; + EIGEN_CUBLASLT_CHECK(cublasLtMatmulDescCreate(&matmul_desc, compute, alpha_type)); + EIGEN_CUBLASLT_CHECK( + cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transA, sizeof(transA))); + EIGEN_CUBLASLT_CHECK( + cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, sizeof(transB))); + + // Matrix layout descriptors (column-major). + // Physical layout dimensions: rows × cols with leading dimension lda/ldb/ldc. + const int64_t a_rows = (transA == CUBLAS_OP_N) ? m : k; + const int64_t b_rows = (transB == CUBLAS_OP_N) ? k : n; + + cublasLtMatrixLayout_t layout_A = nullptr, layout_B = nullptr, layout_C = nullptr; + EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_A, dtype, a_rows, (transA == CUBLAS_OP_N) ? k : m, lda)); + EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_B, dtype, b_rows, (transB == CUBLAS_OP_N) ? n : k, ldb)); + EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_C, dtype, m, n, ldc)); + + // Heuristic selection: query with generous workspace cap, allocate only what's needed. 
+ cublasLtMatmulPreference_t preference = nullptr; + EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceCreate(&preference)); + size_t max_ws = kCublasLtMaxWorkspaceBytes; + EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &max_ws, sizeof(max_ws))); + + cublasLtMatmulHeuristicResult_t result; + int returned_results = 0; + cublasStatus_t heuristic_status = cublasLtMatmulAlgoGetHeuristic(lt_handle, matmul_desc, layout_A, layout_B, layout_C, + layout_C, preference, 1, &result, &returned_results); + + if (heuristic_status == CUBLAS_STATUS_SUCCESS && returned_results > 0) { + // cublasLt path: use the selected algorithm with lazy workspace. + const size_t needed = result.workspaceSize; + if (needed > workspace->size()) { + // Sync only when freeing an existing buffer that may be in use. + if (workspace->ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream)); + *workspace = DeviceBuffer(needed); + } + + EIGEN_CUBLASLT_CHECK(cublasLtMatmul(lt_handle, matmul_desc, alpha, A, layout_A, B, layout_B, beta, C, layout_C, C, + layout_C, &result.algo, workspace->ptr, needed, stream)); + } else { + // Fallback: cublasGemmEx for shapes/types that cublasLt cannot handle. + EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_handle, transA, transB, static_cast(m), static_cast(n), + static_cast(k), alpha, A, dtype, static_cast(lda), B, dtype, + static_cast(ldb), beta, C, dtype, static_cast(ldc), compute, + cuda_gemm_algo())); + } + + // Cleanup descriptors. + EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceDestroy(preference)); + EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_C)); + EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_B)); + EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_A)); + EIGEN_CUBLASLT_CHECK(cublasLtMatmulDescDestroy(matmul_desc)); +} // ---- Type-specific cuBLAS wrappers ------------------------------------------ // cuBLAS uses separate functions per type (Strsm, Dtrsm, etc.). @@ -227,6 +327,100 @@ inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cu reinterpret_cast(B), ldb, reinterpret_cast(C), ldc); } +// ---- cuBLAS Level-1 wrappers ------------------------------------------------ +// Type-dispatched wrappers for BLAS-1 vector operations: dot, axpy, nrm2, scal, copy. +// These work with CUBLAS_POINTER_MODE_HOST or CUBLAS_POINTER_MODE_DEVICE depending +// on the caller's configuration. For device pointer mode, scalar result pointers +// (dot, nrm2) must point to device memory. 
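// Illustrative usage sketch (not part of the original patch): with
// CUBLAS_POINTER_MODE_DEVICE the reduction result is written to device memory
// and no host synchronization happens inside cuBLAS. `handle`, d_x, d_y, and
// d_result are placeholders for caller-owned handles and device allocations:
//
//   cublasPointerMode_t prev;
//   EIGEN_CUBLAS_CHECK(cublasGetPointerMode(handle, &prev));
//   EIGEN_CUBLAS_CHECK(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE));
//   EIGEN_CUBLAS_CHECK(cublasXdot(handle, n, d_x, 1, d_y, 1, d_result));  // d_result in device memory
//   EIGEN_CUBLAS_CHECK(cublasSetPointerMode(handle, prev));
//
// This is the pattern DeviceMatrix::dot() and norm() follow in DeviceDispatch.h below.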
+ +// dot: result = x^T * y (real) or x^H * y (complex conjugate dot) +inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const float* x, int incx, const float* y, int incy, + float* result) { + return cublasSdot(h, n, x, incx, y, incy, result); +} +inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const double* x, int incx, const double* y, int incy, + double* result) { + return cublasDdot(h, n, x, incx, y, incy, result); +} +inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const std::complex* x, int incx, + const std::complex* y, int incy, std::complex* result) { + return cublasCdotc(h, n, reinterpret_cast(x), incx, reinterpret_cast(y), incy, + reinterpret_cast(result)); +} +inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const std::complex* x, int incx, + const std::complex* y, int incy, std::complex* result) { + return cublasZdotc(h, n, reinterpret_cast(x), incx, + reinterpret_cast(y), incy, reinterpret_cast(result)); +} + +// nrm2: result = ||x||_2 (always returns real) +inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const float* x, int incx, float* result) { + return cublasSnrm2(h, n, x, incx, result); +} +inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const double* x, int incx, double* result) { + return cublasDnrm2(h, n, x, incx, result); +} +inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const std::complex* x, int incx, float* result) { + return cublasScnrm2(h, n, reinterpret_cast(x), incx, result); +} +inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const std::complex* x, int incx, double* result) { + return cublasDznrm2(h, n, reinterpret_cast(x), incx, result); +} + +// axpy: y += alpha * x +inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const float* alpha, const float* x, int incx, float* y, + int incy) { + return cublasSaxpy(h, n, alpha, x, incx, y, incy); +} +inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const double* alpha, const double* x, int incx, double* y, + int incy) { + return cublasDaxpy(h, n, alpha, x, incx, y, incy); +} +inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const std::complex* alpha, + const std::complex* x, int incx, std::complex* y, int incy) { + return cublasCaxpy(h, n, reinterpret_cast(alpha), reinterpret_cast(x), incx, + reinterpret_cast(y), incy); +} +inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const std::complex* alpha, + const std::complex* x, int incx, std::complex* y, int incy) { + return cublasZaxpy(h, n, reinterpret_cast(alpha), reinterpret_cast(x), + incx, reinterpret_cast(y), incy); +} + +// scal: x *= alpha +inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const float* alpha, float* x, int incx) { + return cublasSscal(h, n, alpha, x, incx); +} +inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const double* alpha, double* x, int incx) { + return cublasDscal(h, n, alpha, x, incx); +} +inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const std::complex* alpha, std::complex* x, + int incx) { + return cublasCscal(h, n, reinterpret_cast(alpha), reinterpret_cast(x), incx); +} +inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const std::complex* alpha, std::complex* x, + int incx) { + return cublasZscal(h, n, reinterpret_cast(alpha), reinterpret_cast(x), + incx); +} + +// copy: y = x +inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const float* x, int incx, float* y, int incy) { + return cublasScopy(h, n, x, incx, y, incy); +} +inline cublasStatus_t 
cublasXcopy(cublasHandle_t h, int n, const double* x, int incx, double* y, int incy) { + return cublasDcopy(h, n, x, incx, y, incy); +} +inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const std::complex* x, int incx, + std::complex* y, int incy) { + return cublasCcopy(h, n, reinterpret_cast(x), incx, reinterpret_cast(y), incy); +} +inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const std::complex* x, int incx, + std::complex* y, int incy) { + return cublasZcopy(h, n, reinterpret_cast(x), incx, reinterpret_cast(y), + incy); +} + } // namespace internal } // namespace Eigen diff --git a/Eigen/src/GPU/DeviceDispatch.h b/Eigen/src/GPU/DeviceDispatch.h index c6f6e5abf..e061a8dc5 100644 --- a/Eigen/src/GPU/DeviceDispatch.h +++ b/Eigen/src/GPU/DeviceDispatch.h @@ -29,10 +29,11 @@ namespace Eigen { namespace internal { // ---- GEMM dispatch ---------------------------------------------------------- -// GemmExpr → cublasGemmEx(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) +// GemmExpr → cublasLtMatmul via GpuContext. // -// The generic API cublasGemmEx handles all scalar types (float, double, -// complex, complex) via cudaDataType_t. +// Uses cublasLtMatmul for 64-bit dimension support and heuristic algorithm +// selection. All scalar types (float, double, complex, complex) +// are handled via cudaDataType_t. template void dispatch_gemm( @@ -46,6 +47,10 @@ void dispatch_gemm( const DeviceMatrix& A = traits_lhs::matrix(expr.lhs()); const DeviceMatrix& B = traits_rhs::matrix(expr.rhs()); + // cuBLAS GEMM: C must not alias A or B (undefined behavior). + eigen_assert(dst.data() != A.data() && "GEMM: output aliases left operand (use a temporary)"); + eigen_assert(dst.data() != B.data() && "GEMM: output aliases right operand (use a temporary)"); + constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op); constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op); @@ -89,13 +94,8 @@ void dispatch_gemm( EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream())); } - constexpr cudaDataType_t dtype = cuda_data_type::value; - constexpr cublasComputeType_t compute = cuda_compute_type::value; - - EIGEN_CUBLAS_CHECK(cublasGemmEx(ctx.cublasHandle(), transA, transB, static_cast(m), static_cast(n), - static_cast(k), &alpha_gval, A.data(), dtype, static_cast(lda), B.data(), - dtype, static_cast(ldb), &beta_gval, dst.data(), dtype, static_cast(ldc), - compute, cuda_gemm_algo())); + cublaslt_gemm(ctx.cublasLtHandle(), ctx.cublasHandle(), transA, transB, m, n, k, &alpha_gval, A.data(), lda, + B.data(), ldb, &beta_gval, dst.data(), ldc, ctx.gemmWorkspace(), ctx.stream()); dst.recordReady(ctx.stream()); } @@ -504,6 +504,284 @@ void DeviceSelfAdjointView::rankUpdate(const DeviceMatrix +DeviceScalar::Scalar> DeviceMatrix::dot(GpuContext& ctx, + const DeviceMatrix& other) const { + const int n = static_cast(rows_ * cols_); + eigen_assert(n == static_cast(other.rows_ * other.cols_)); + DeviceScalar result(Scalar(0), ctx.stream()); + if (n > 0) { + waitReady(ctx.stream()); + other.waitReady(ctx.stream()); + cublasPointerMode_t prev; + EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev)); + EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE)); + EIGEN_CUBLAS_CHECK(internal::cublasXdot(ctx.cublasHandle(), n, data_, 1, other.data_, 1, result.devicePtr())); + EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev)); + } + return result; +} + +namespace internal { +// Real: dot(x,x) 
returns DeviceScalar which IS DeviceScalar. +// Move-construct without any sync. +template +typename std::enable_if::value, DeviceScalar>::type squaredNorm_from_dot( + DeviceScalar&& d, cudaStream_t) { + return std::move(d); +} +// Complex: must sync to extract the real part (DeviceScalar arithmetic is real-only). +template +typename std::enable_if::value, DeviceScalar>::type squaredNorm_from_dot( + DeviceScalar&& d, cudaStream_t stream) { + return DeviceScalar(numext::real(Scalar(d)), stream); +} +} // namespace internal + +template +DeviceScalar::Real> DeviceMatrix::squaredNorm(GpuContext& ctx) const { + // Use dot(x,x) instead of nrm2()^2: dot kernel is ~4.5x faster than nrm2 + // (nrm2 uses a numerically careful scaled-sum-of-squares algorithm that is + // unnecessary for CG convergence checks). + using RealScalar = typename NumTraits::Real; + return internal::squaredNorm_from_dot(dot(ctx, *this), ctx.stream()); +} + +template +DeviceScalar::Real> DeviceMatrix::norm(GpuContext& ctx) const { + using RealScalar = typename NumTraits::Real; + const int n = static_cast(rows_ * cols_); + DeviceScalar result(RealScalar(0), ctx.stream()); + if (n > 0) { + waitReady(ctx.stream()); + cublasPointerMode_t prev; + EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev)); + EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE)); + EIGEN_CUBLAS_CHECK(internal::cublasXnrm2(ctx.cublasHandle(), n, data_, 1, result.devicePtr())); + EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev)); + } + return result; +} + +template +void DeviceMatrix::setZero(GpuContext& ctx) { + if (sizeInBytes() > 0) { + waitReady(ctx.stream()); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(data_, 0, sizeInBytes(), ctx.stream())); + recordReady(ctx.stream()); + } +} + +template +void DeviceMatrix::addScaled(GpuContext& ctx, Scalar alpha, const DeviceMatrix& x) { + const int n = static_cast(rows_ * cols_); + eigen_assert(n == static_cast(x.rows_ * x.cols_)); + if (n > 0) { + waitReady(ctx.stream()); + x.waitReady(ctx.stream()); + EIGEN_CUBLAS_CHECK(internal::cublasXaxpy(ctx.cublasHandle(), n, &alpha, x.data_, 1, data_, 1)); + recordReady(ctx.stream()); + } +} + +template +void DeviceMatrix::scale(GpuContext& ctx, Scalar alpha) { + const int n = static_cast(rows_ * cols_); + if (n > 0) { + waitReady(ctx.stream()); + EIGEN_CUBLAS_CHECK(internal::cublasXscal(ctx.cublasHandle(), n, &alpha, data_, 1)); + recordReady(ctx.stream()); + } +} + +template +void DeviceMatrix::copyFrom(GpuContext& ctx, const DeviceMatrix& other) { + // Wait on *this before resize — resize may free the old buffer while another + // stream is still reading it. 
+ if (!empty()) waitReady(ctx.stream()); + resize(other.rows_, other.cols_); + const int n = static_cast(rows_ * cols_); + if (n > 0) { + other.waitReady(ctx.stream()); + EIGEN_CUBLAS_CHECK(internal::cublasXcopy(ctx.cublasHandle(), n, other.data_, 1, data_, 1)); + recordReady(ctx.stream()); + } +} + +// ---- BLAS-1 operator overloads for CG compatibility ------------------------- + +// this += alpha * x (axpy) +template +DeviceMatrix& DeviceMatrix::operator+=(const DeviceScaled& expr) { + addScaled(GpuContext::threadLocal(), expr.scalar(), internal::device_expr_traits::matrix(expr.inner())); + return *this; +} + +// this -= alpha * x (axpy with negated alpha) +template +DeviceMatrix& DeviceMatrix::operator-=(const DeviceScaled& expr) { + addScaled(GpuContext::threadLocal(), -expr.scalar(), + internal::device_expr_traits::matrix(expr.inner())); + return *this; +} + +// this += x (axpy with alpha=1) +template +DeviceMatrix& DeviceMatrix::operator+=(const DeviceMatrix& other) { + Scalar one(1); + addScaled(GpuContext::threadLocal(), one, other); + return *this; +} + +// this -= x (axpy with alpha=-1) +template +DeviceMatrix& DeviceMatrix::operator-=(const DeviceMatrix& other) { + Scalar neg_one(-1); + addScaled(GpuContext::threadLocal(), neg_one, other); + return *this; +} + +// this *= alpha (scal, host pointer) +template +DeviceMatrix& DeviceMatrix::operator*=(Scalar alpha) { + scale(GpuContext::threadLocal(), alpha); + return *this; +} + +// this *= alpha (scal, device pointer — avoids host sync) +template +DeviceMatrix& DeviceMatrix::operator*=(const DeviceScalar& alpha) { + const int n = static_cast(rows_ * cols_); + if (n > 0) { + auto& ctx = GpuContext::threadLocal(); + waitReady(ctx.stream()); + cublasPointerMode_t prev; + EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev)); + EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE)); + EIGEN_CUBLAS_CHECK(internal::cublasXscal(ctx.cublasHandle(), n, alpha.devicePtr(), data_, 1)); + EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev)); + recordReady(ctx.stream()); + } + return *this; +} + +// this += DeviceScalar * x (axpy with CUBLAS_POINTER_MODE_DEVICE) +template +DeviceMatrix& DeviceMatrix::operator+=(const DeviceScaledDevice& expr) { + const int n = static_cast(rows_ * cols_); + const auto& x = expr.matrix(); + eigen_assert(n == static_cast(x.rows_ * x.cols_)); + if (n > 0) { + auto& ctx = GpuContext::threadLocal(); + waitReady(ctx.stream()); + x.waitReady(ctx.stream()); + cublasPointerMode_t prev; + EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev)); + EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE)); + EIGEN_CUBLAS_CHECK(internal::cublasXaxpy(ctx.cublasHandle(), n, expr.alpha().devicePtr(), x.data_, 1, data_, 1)); + EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev)); + recordReady(ctx.stream()); + } + return *this; +} + +// this -= DeviceScalar * x (axpy with negated device scalar) +template +DeviceMatrix& DeviceMatrix::operator-=(const DeviceScaledDevice& expr) { + auto neg_alpha = -expr.alpha(); + DeviceScaledDevice neg_expr(neg_alpha, expr.matrix()); + return operator+=(neg_expr); +} + +// this = alpha * A + beta * B (cuBLAS geam) +template +DeviceMatrix& DeviceMatrix::operator=(const DeviceAddExpr& expr) { + auto& ctx = GpuContext::threadLocal(); + const auto& A = expr.A(); + const auto& B = expr.B(); + eigen_assert(A.rows() == B.rows() && A.cols() == B.cols()); + const int m = 
static_cast(A.rows()); + const int n = static_cast(A.cols()); + // Wait on *this before resize — resize may free the old buffer while another + // stream is still reading it. + if (!empty()) waitReady(ctx.stream()); + resize(A.rows(), A.cols()); + if (m > 0 && n > 0) { + A.waitReady(ctx.stream()); + B.waitReady(ctx.stream()); + Scalar_ alpha = expr.alpha(); + Scalar_ beta = expr.beta(); + EIGEN_CUBLAS_CHECK(internal::cublasXgeam(ctx.cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, &alpha, A.data(), m, + &beta, B.data(), m, data_, m)); + recordReady(ctx.stream()); + } + return *this; +} + +// cwiseProduct via NPP nppsMul (allocating). +template +DeviceMatrix DeviceMatrix::cwiseProduct(GpuContext& ctx, const DeviceMatrix& other) const { + const int n = static_cast(rows_ * cols_); + eigen_assert(n == static_cast(other.rows_ * other.cols_)); + DeviceMatrix result(rows_, cols_); + if (n > 0) { + waitReady(ctx.stream()); + other.waitReady(ctx.stream()); + internal::device_cwiseProduct(data_, other.data_, result.data_, n, ctx.stream()); + result.recordReady(ctx.stream()); + } + return result; +} + +// In-place cwiseProduct: this = a .* b (reuses this buffer, no allocation). +template +void DeviceMatrix::cwiseProduct(GpuContext& ctx, const DeviceMatrix& a, const DeviceMatrix& b) { + const int n = static_cast(a.rows_ * a.cols_); + eigen_assert(n == static_cast(b.rows_ * b.cols_)); + if (!empty()) waitReady(ctx.stream()); + resize(a.rows_, a.cols_); + if (n > 0) { + a.waitReady(ctx.stream()); + b.waitReady(ctx.stream()); + internal::device_cwiseProduct(a.data_, b.data_, data_, n, ctx.stream()); + recordReady(ctx.stream()); + } +} + +// Convenience overloads using thread-local default GpuContext. +template +DeviceScalar::Scalar> DeviceMatrix::dot(const DeviceMatrix& other) const { + return dot(GpuContext::threadLocal(), other); +} + +template +DeviceScalar::Real> DeviceMatrix::squaredNorm() const { + return squaredNorm(GpuContext::threadLocal()); +} + +template +DeviceScalar::Real> DeviceMatrix::norm() const { + return norm(GpuContext::threadLocal()); +} + +template +void DeviceMatrix::setZero() { + setZero(GpuContext::threadLocal()); +} + } // namespace Eigen #endif // EIGEN_GPU_DEVICE_DISPATCH_H diff --git a/Eigen/src/GPU/DeviceExpr.h b/Eigen/src/GPU/DeviceExpr.h index 03f36cdd1..254aa12ab 100644 --- a/Eigen/src/GPU/DeviceExpr.h +++ b/Eigen/src/GPU/DeviceExpr.h @@ -219,6 +219,87 @@ DeviceScaled> operator*(S alpha, const DeviceTransposeVie return {alpha, m}; } +// ---- DeviceScaledDevice: DeviceScalar * DeviceMatrix → device-pointer axpy --- +// Like DeviceScaled but carries a DeviceScalar (device pointer) instead of +// a host scalar. operator+= dispatches to cuBLAS axpy with POINTER_MODE_DEVICE. + +template +class DeviceScaledDevice { + public: + using Scalar = Scalar_; + DeviceScaledDevice(const DeviceScalar& alpha, const DeviceMatrix& mat) : alpha_(alpha), mat_(mat) {} + const DeviceScalar& alpha() const { return alpha_; } + const DeviceMatrix& matrix() const { return mat_; } + + private: + const DeviceScalar& alpha_; + const DeviceMatrix& mat_; +}; + +// DeviceScalar * DeviceMatrix → DeviceScaledDevice +template +DeviceScaledDevice operator*(const DeviceScalar& alpha, const DeviceMatrix& m) { + return {alpha, m}; +} + +// ---- DeviceAddExpr: a + b → cublasXgeam ------------------------------------- +// Captures `DeviceMatrix + DeviceScaled` (and reverse). +// Dispatched to geam: C = alpha * A + beta * B. 
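// Illustrative usage sketch (not part of the original patch): with the overloads
// below, an expression such as
//
//   d_z = d_r + beta * d_p;   // beta is a host Scalar, d_* are DeviceMatrix vectors
//
// builds a DeviceAddExpr with alpha = 1 and beta taken from the DeviceScaled
// operand, and DeviceMatrix::operator=(const DeviceAddExpr&) evaluates it as a
// single cublasXgeam call.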
+// +// Note: These operator+/- overloads are intentionally free functions on +// DeviceMatrix, not Eigen expression templates. DeviceMatrix does not inherit +// from MatrixBase, so there is no ambiguity with Eigen's own operator+/-. +// If DeviceMatrix is ever made an Eigen expression type, these would need to +// be revisited. + +template +class DeviceAddExpr { + public: + using Scalar = Scalar_; + DeviceAddExpr(Scalar alpha, const DeviceMatrix& A, Scalar beta, const DeviceMatrix& B) + : alpha_(alpha), A_(A), beta_(beta), B_(B) {} + Scalar alpha() const { return alpha_; } + Scalar beta() const { return beta_; } + const DeviceMatrix& A() const { return A_; } + const DeviceMatrix& B() const { return B_; } + + private: + Scalar alpha_; + const DeviceMatrix& A_; + Scalar beta_; + const DeviceMatrix& B_; +}; + +// DeviceMatrix + DeviceMatrix → DeviceAddExpr (alpha=1, beta=1) +template +DeviceAddExpr operator+(const DeviceMatrix& a, const DeviceMatrix& b) { + return {S(1), a, S(1), b}; +} + +// DeviceMatrix + DeviceScaled → DeviceAddExpr (alpha=1, beta=scaled) +template +DeviceAddExpr operator+(const DeviceMatrix& a, const DeviceScaled>& b) { + return {S(1), a, b.scalar(), b.inner()}; +} + +// DeviceScaled + DeviceMatrix → DeviceAddExpr (alpha=scaled, beta=1) +template +DeviceAddExpr operator+(const DeviceScaled>& a, const DeviceMatrix& b) { + return {a.scalar(), a.inner(), S(1), b}; +} + +// DeviceMatrix - DeviceMatrix → DeviceAddExpr (alpha=1, beta=-1) +template +DeviceAddExpr operator-(const DeviceMatrix& a, const DeviceMatrix& b) { + return {S(1), a, S(-1), b}; +} + +// DeviceMatrix - DeviceScaled → DeviceAddExpr (alpha=1, beta=-scaled) +template +DeviceAddExpr operator-(const DeviceMatrix& a, const DeviceScaled>& b) { + return {S(1), a, -b.scalar(), b.inner()}; +} + } // namespace Eigen #endif // EIGEN_GPU_DEVICE_EXPR_H diff --git a/Eigen/src/GPU/DeviceMatrix.h b/Eigen/src/GPU/DeviceMatrix.h index 2f999954f..3209d7925 100644 --- a/Eigen/src/GPU/DeviceMatrix.h +++ b/Eigen/src/GPU/DeviceMatrix.h @@ -53,6 +53,16 @@ template class DeviceAssignment; template class GemmExpr; +template +class DeviceScaled; +template +class SpMVExpr; +template +class DeviceAddExpr; +template +class DeviceScaledDevice; +template +class DeviceScalar; template class LltSolveExpr; template @@ -170,6 +180,8 @@ template class DeviceMatrix { public: using Scalar = Scalar_; + using RealScalar = typename NumTraits::Real; + using PlainObject = DeviceMatrix; // owning type (for CG template compatibility) using PlainMatrix = Matrix; // ---- Construction / destruction ------------------------------------------ @@ -177,6 +189,16 @@ class DeviceMatrix { /** Default: empty (0x0, no allocation). */ DeviceMatrix() = default; + /** Allocate uninitialized column vector of given size. + * Matches Matrix(n) for CG template compatibility. */ + explicit DeviceMatrix(Index n) : rows_(n), cols_(1) { + eigen_assert(n >= 0); + size_t bytes = sizeInBytes(); + if (bytes > 0) { + EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast(&data_), bytes)); + } + } + /** Allocate uninitialized device memory for a rows x cols matrix. */ DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols) { eigen_assert(rows >= 0 && cols >= 0); @@ -446,6 +468,104 @@ class DeviceMatrix { template DeviceMatrix& operator=(const SymmExpr& expr); + // ---- BLAS Level-1 operations ---------------------------------------------- + // DeviceMatrix is always dense (lda == rows), so a vector is simply a + // DeviceMatrix with cols == 1. 
These BLAS-1 methods operate on the flat + // rows*cols element array, making them work for both vectors and matrices. + // + // All methods take an explicit GpuContext& for stream/handle control. + // When everything uses the same context, event waits are skipped (same-stream). + // Defined out-of-line in DeviceDispatch.h (needs GpuContext). + + /** Dot product: this^H * other. Returns DeviceScalar — the result stays + * on device until read via implicit conversion to Scalar (which syncs). + * When used with `auto`, no sync occurs until the value is needed. */ + DeviceScalar dot(GpuContext& ctx, const DeviceMatrix& other) const; + + /** Squared L2 norm via dot(x, x). Returns DeviceScalar (no sync until read). + * For real types, the result stays on device. For complex types, falls back + * to host sync (DeviceScalar arithmetic is real-only). */ + DeviceScalar::Real> squaredNorm(GpuContext& ctx) const; + + /** L2 norm. Returns DeviceScalar (no host sync). */ + DeviceScalar::Real> norm(GpuContext& ctx) const; + + /** Set all elements to zero. */ + void setZero(GpuContext& ctx); + + /** this += alpha * x (cuBLAS axpy). Requires same total size. */ + void addScaled(GpuContext& ctx, Scalar alpha, const DeviceMatrix& x); + + /** this *= alpha (cuBLAS scal). */ + void scale(GpuContext& ctx, Scalar alpha); + + /** Deep copy: this = other (cuBLAS copy). Resizes if needed. */ + void copyFrom(GpuContext& ctx, const DeviceMatrix& other); + + // Convenience overloads using the thread-local default GpuContext. + DeviceScalar dot(const DeviceMatrix& other) const; + DeviceScalar::Real> squaredNorm() const; + DeviceScalar::Real> norm() const; + void setZero(); + + // ---- BLAS-1 operator overloads for CG/iterative solver compatibility ------ + // These allow CG code like `x += alpha * p` to work with DeviceMatrix. + // `alpha * DeviceMatrix` already returns `DeviceScaled>` + // (defined in DeviceExpr.h). These operators dispatch to cuBLAS axpy/scal. + // Defined out-of-line in DeviceDispatch.h. + + /** this += alpha * x (cuBLAS axpy). For `x += alpha * p`. */ + DeviceMatrix& operator+=(const DeviceScaled& expr); + + /** this -= alpha * x (cuBLAS axpy with negated alpha). For `r -= alpha * tmp`. */ + DeviceMatrix& operator-=(const DeviceScaled& expr); + + /** this += x (cuBLAS axpy with alpha=1). */ + DeviceMatrix& operator+=(const DeviceMatrix& other); + + /** this -= x (cuBLAS axpy with alpha=-1). */ + DeviceMatrix& operator-=(const DeviceMatrix& other); + + /** this *= alpha (cuBLAS scal, host pointer mode). For `p *= beta`. */ + DeviceMatrix& operator*=(Scalar alpha); + + /** this *= alpha (cuBLAS scal, device pointer mode). Avoids host sync. */ + DeviceMatrix& operator*=(const DeviceScalar& alpha); + + /** Element-wise product: result[i] = this[i] * other[i] (NPP nppsMul). + * Returns a new DeviceMatrix. Defined out-of-line in DeviceDispatch.h. */ + DeviceMatrix cwiseProduct(GpuContext& ctx, const DeviceMatrix& other) const; + + /** In-place element-wise product: this[i] = a[i] * b[i] (NPP nppsMul). + * Reuses this matrix's buffer when sizes match, avoiding cudaMalloc. */ + void cwiseProduct(GpuContext& ctx, const DeviceMatrix& a, const DeviceMatrix& b); + + /** this += DeviceScalar * x (cuBLAS axpy with POINTER_MODE_DEVICE). */ + DeviceMatrix& operator+=(const DeviceScaledDevice& expr); + + /** this -= DeviceScalar * x (cuBLAS axpy with negated device scalar). */ + DeviceMatrix& operator-=(const DeviceScaledDevice& expr); + + /** Assign from an SpMV expression: d_y = d_A * d_x. 
*/ + DeviceMatrix& operator=(const SpMVExpr& expr); + + /** Assign from an add expression: d_C = alpha * d_A + beta * d_B (cuBLAS geam). */ + DeviceMatrix& operator=(const DeviceAddExpr& expr); + + /** No-op — all DeviceMatrix operations are implicitly noalias. + * + * Unlike Eigen's Matrix, where omitting .noalias() triggers a copy to a + * temporary for safety, DeviceMatrix dispatches directly to NVIDIA library + * calls which have no built-in aliasing protection. Every assignment + * (`d_C = d_A * d_B`, `d_y = d_A * d_x`, etc.) behaves as if .noalias() + * were specified. The caller must ensure operands don't alias the + * destination for GEMM and SpMV. geam (`d_C = d_A + alpha * d_B`) is + * safe with aliasing. Debug asserts catch violations. + * + * This method exists so that `tmp.noalias() = mat * p` compiles for both + * Matrix and DeviceMatrix. */ + DeviceMatrix& noalias() { return *this; } + private: // ---- Private: adopt a raw device pointer (used by friend solvers) -------- diff --git a/Eigen/src/GPU/DeviceScalar.h b/Eigen/src/GPU/DeviceScalar.h new file mode 100644 index 000000000..4bdc3a0a9 --- /dev/null +++ b/Eigen/src/GPU/DeviceScalar.h @@ -0,0 +1,121 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// Device-resident scalar for deferred host synchronization. +// +// DeviceScalar wraps a single value in device memory. Reductions +// (dot, nrm2) write results directly to device memory via +// CUBLAS_POINTER_MODE_DEVICE, deferring host sync until the value is read. +// +// Implicit conversion to Scalar triggers cudaStreamSynchronize + download. +// In CG, this reduces 3 syncs/iter to effectively 1: the first conversion +// syncs the stream, subsequent conversions in the same expression just +// download (the stream is already flushed). +// +// Usage: +// auto dot_val = d_x.dot(d_y); // DeviceScalar, no sync +// auto norm_val = d_r.squaredNorm(); // DeviceScalar, no sync +// Scalar alpha = absNew / dot_val; // sync here (both values downloaded) +// d_x += alpha * d_p; // host-scalar axpy (as before) + +#ifndef EIGEN_GPU_DEVICE_SCALAR_H +#define EIGEN_GPU_DEVICE_SCALAR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./GpuSupport.h" +#include "./DeviceScalarOps.h" + +namespace Eigen { + +template +class DeviceScalar { + public: + using Scalar = Scalar_; + + /** Allocate uninitialized device scalar. Contents are undefined until written + * (e.g., by cuBLAS dot/nrm2 with POINTER_MODE_DEVICE). Consistent with + * DeviceMatrix(rows, cols) which also does not zero-initialize. 
*/ + explicit DeviceScalar(cudaStream_t stream = nullptr) : d_val_(sizeof(Scalar)), stream_(stream) {} + + DeviceScalar(Scalar host_val, cudaStream_t stream) : d_val_(sizeof(Scalar)), stream_(stream) { + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_val_.ptr, &host_val, sizeof(Scalar), cudaMemcpyHostToDevice, stream_)); + } + + DeviceScalar(DeviceScalar&& o) noexcept : d_val_(std::move(o.d_val_)), stream_(o.stream_) { o.stream_ = nullptr; } + + DeviceScalar& operator=(DeviceScalar&& o) noexcept { + if (this != &o) { + d_val_ = std::move(o.d_val_); + stream_ = o.stream_; + o.stream_ = nullptr; + } + return *this; + } + + DeviceScalar(const DeviceScalar&) = delete; + DeviceScalar& operator=(const DeviceScalar&) = delete; + + /** Download from device. Synchronizes the stream on first call; + * subsequent calls in the same expression are cheap (stream already flushed). */ + Scalar get() const { + Scalar result; + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&result, d_val_.ptr, sizeof(Scalar), cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + return result; + } + + /** Implicit conversion — allows `Scalar alpha = deviceScalar` and + * `if (deviceScalar < threshold)`. Triggers sync. */ + operator Scalar() const { return get(); } + + Scalar* devicePtr() { return static_cast(d_val_.ptr); } + const Scalar* devicePtr() const { return static_cast(d_val_.ptr); } + cudaStream_t stream() const { return stream_; } + + // ---- Device-side arithmetic (no host sync) --------------------------------- + // Uses NPP from DeviceScalarOps.h. All results stay on device. + // Currently supports real types only (float, double). Complex types + // fall back to implicit conversion (host sync) for division. + // + // Note: DeviceScalar has no cross-stream readiness tracking. All + // operations must be on the same CUDA stream. This is the natural + // pattern in iterative solvers where one GpuContext owns all work. + + friend DeviceScalar operator/(const DeviceScalar& a, const DeviceScalar& b) { + DeviceScalar result(a.stream_); + internal::device_scalar_div(a.devicePtr(), b.devicePtr(), result.devicePtr(), a.stream_); + return result; + } + + friend DeviceScalar operator/(Scalar a, const DeviceScalar& b) { + DeviceScalar d_a(a, b.stream_); + return d_a / b; + } + + friend DeviceScalar operator/(const DeviceScalar& a, Scalar b) { + DeviceScalar d_b(b, a.stream_); + return a / d_b; + } + + DeviceScalar operator-() const { + DeviceScalar result(stream_); + internal::device_scalar_neg(devicePtr(), result.devicePtr(), stream_); + return result; + } + + private: + internal::DeviceBuffer d_val_; + cudaStream_t stream_ = nullptr; +}; + +} // namespace Eigen + +#endif // EIGEN_GPU_DEVICE_SCALAR_H diff --git a/Eigen/src/GPU/DeviceScalarOps.h b/Eigen/src/GPU/DeviceScalarOps.h new file mode 100644 index 000000000..c668e4b7d --- /dev/null +++ b/Eigen/src/GPU/DeviceScalarOps.h @@ -0,0 +1,117 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// Device-resident scalar and element-wise operations via NPP signals. +// Header-only — no custom CUDA kernels needed. Uses nppsDiv, nppsMul, +// nppsMulC from the NPP library (CUDA::npps, part of the CUDA toolkit). 
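// Illustrative usage sketch (not part of the original patch), assuming a real
// Scalar such as double: combined with DeviceScalar, these helpers keep the CG
// scalar recurrence on the device with no host sync.
//
//   auto absNew = d_r.squaredNorm();   // DeviceScalar<double>, no host sync
//   auto denom  = d_p.dot(d_tmp);      // DeviceScalar<double>, no host sync
//   auto alpha  = absNew / denom;      // nppsDiv_64f_Ctx on a single element
//   d_x += alpha * d_p;                // cuBLAS axpy with CUBLAS_POINTER_MODE_DEVICE
//   d_r -= alpha * d_tmp;              // negation via nppsMulC, then axpy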
+ +#ifndef EIGEN_GPU_DEVICE_SCALAR_OPS_H +#define EIGEN_GPU_DEVICE_SCALAR_OPS_H + +#include +#include + +namespace Eigen { +namespace internal { + +// ---- NppStreamContext helper ------------------------------------------------ + +inline NppStreamContext make_npp_stream_ctx(cudaStream_t stream) { + // Cache device attributes (constant for process lifetime) in a thread-local. + // Only the stream and its flags vary per call. + struct CachedDeviceInfo { + bool initialized = false; + int device_id = 0; + int cc_major = 0; + int cc_minor = 0; + int mp_count = 0; + int max_threads_per_mp = 0; + int max_threads_per_block = 0; + int shared_mem_per_block = 0; + + void init() { + if (initialized) return; + cudaGetDevice(&device_id); + cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device_id); + cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device_id); + cudaDeviceGetAttribute(&mp_count, cudaDevAttrMultiProcessorCount, device_id); + cudaDeviceGetAttribute(&max_threads_per_mp, cudaDevAttrMaxThreadsPerMultiProcessor, device_id); + cudaDeviceGetAttribute(&max_threads_per_block, cudaDevAttrMaxThreadsPerBlock, device_id); + cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id); + initialized = true; + } + }; + thread_local CachedDeviceInfo cached; + cached.init(); + + NppStreamContext ctx = {}; + ctx.hStream = stream; + ctx.nCudaDeviceId = cached.device_id; + ctx.nCudaDevAttrComputeCapabilityMajor = cached.cc_major; + ctx.nCudaDevAttrComputeCapabilityMinor = cached.cc_minor; + ctx.nMultiProcessorCount = cached.mp_count; + ctx.nMaxThreadsPerMultiProcessor = cached.max_threads_per_mp; + ctx.nMaxThreadsPerBlock = cached.max_threads_per_block; + ctx.nSharedMemPerBlock = cached.shared_mem_per_block; + cudaStreamGetFlags(stream, &ctx.nStreamFlags); + return ctx; +} + +// ---- Scalar division: c = a / b (device-resident, async) -------------------- + +inline void device_scalar_div(const float* a, const float* b, float* c, cudaStream_t stream) { + NppStreamContext npp_ctx = make_npp_stream_ctx(stream); + nppsDiv_32f_Ctx(b, a, c, 1, npp_ctx); // NPP: pDst[i] = pSrc2[i] / pSrc1[i] +} + +inline void device_scalar_div(const double* a, const double* b, double* c, cudaStream_t stream) { + NppStreamContext npp_ctx = make_npp_stream_ctx(stream); + nppsDiv_64f_Ctx(b, a, c, 1, npp_ctx); // NPP: pDst[i] = pSrc2[i] / pSrc1[i] +} + +// ---- Scalar negation: c = -a (device-resident, async) ----------------------- + +inline void device_scalar_neg(const float* a, float* c, cudaStream_t stream) { + NppStreamContext npp_ctx = make_npp_stream_ctx(stream); + nppsMulC_32f_Ctx(a, -1.0f, c, 1, npp_ctx); +} + +inline void device_scalar_neg(const double* a, double* c, cudaStream_t stream) { + NppStreamContext npp_ctx = make_npp_stream_ctx(stream); + nppsMulC_64f_Ctx(a, -1.0, c, 1, npp_ctx); +} + +// ---- Element-wise vector multiply: c[i] = a[i] * b[i] ---------------------- + +inline void device_cwiseProduct(const float* a, const float* b, float* c, int n, cudaStream_t stream) { + NppStreamContext npp_ctx = make_npp_stream_ctx(stream); + nppsMul_32f_Ctx(a, b, c, static_cast(n), npp_ctx); +} + +inline void device_cwiseProduct(const double* a, const double* b, double* c, int n, cudaStream_t stream) { + NppStreamContext npp_ctx = make_npp_stream_ctx(stream); + nppsMul_64f_Ctx(a, b, c, static_cast(n), npp_ctx); +} + +// ---- Element-wise vector division: c[i] = a[i] / b[i] ---------------------- + +inline void device_cwiseQuotient(const float* a, const 
float* b, float* c, int n, cudaStream_t stream) { + NppStreamContext npp_ctx = make_npp_stream_ctx(stream); + nppsDiv_32f_Ctx(b, a, c, static_cast(n), npp_ctx); // NPP: dst = src2 / src1 +} + +inline void device_cwiseQuotient(const double* a, const double* b, double* c, int n, cudaStream_t stream) { + NppStreamContext npp_ctx = make_npp_stream_ctx(stream); + nppsDiv_64f_Ctx(b, a, c, static_cast(n), npp_ctx); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_GPU_DEVICE_SCALAR_OPS_H diff --git a/Eigen/src/GPU/GpuContext.h b/Eigen/src/GPU/GpuContext.h index f5da2451f..2e7a48bc2 100644 --- a/Eigen/src/GPU/GpuContext.h +++ b/Eigen/src/GPU/GpuContext.h @@ -28,6 +28,8 @@ #include "./CuBlasSupport.h" #include "./CuSolverSupport.h" +#include +#include namespace Eigen { @@ -44,38 +46,92 @@ namespace Eigen { */ class GpuContext { public: - GpuContext() { + /** Create a new context with a dedicated CUDA stream. */ + GpuContext() : owns_stream_(true) { EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_)); - EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_)); - EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_)); - EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_)); - EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_)); + init_handles(); } + /** Create a context on an existing stream (e.g., stream 0 = nullptr). + * The caller retains ownership of the stream — this context will not destroy it. */ + explicit GpuContext(cudaStream_t stream) : stream_(stream), owns_stream_(false) { init_handles(); } + ~GpuContext() { + if (cusparse_) (void)cusparseDestroy(cusparse_); if (cusolver_) (void)cusolverDnDestroy(cusolver_); + if (cublas_lt_) (void)cublasLtDestroy(cublas_lt_); if (cublas_) (void)cublasDestroy(cublas_); - if (stream_) (void)cudaStreamDestroy(stream_); + if (owns_stream_ && stream_) (void)cudaStreamDestroy(stream_); } // Non-copyable, non-movable (owns library handles). GpuContext(const GpuContext&) = delete; GpuContext& operator=(const GpuContext&) = delete; - /** Lazily-created thread-local default context. */ + /** Get the thread-local default context. + * If setThreadLocal() has been called, returns that context. + * Otherwise lazily creates a new context with a dedicated stream. */ static GpuContext& threadLocal() { + GpuContext* override = tl_override_ptr(); + if (override) return *override; thread_local GpuContext ctx; return ctx; } + /** Override the thread-local default context for this thread. + * The caller retains ownership of \p ctx — it must outlive all uses. + * Pass nullptr to restore the lazily-created default. */ + static void setThreadLocal(GpuContext* ctx) { tl_override_ptr() = ctx; } + cudaStream_t stream() const { return stream_; } cublasHandle_t cublasHandle() const { return cublas_; } cusolverDnHandle_t cusolverHandle() const { return cusolver_; } + /** cuBLASLt handle (lazy-initialized on first GEMM call). */ + cublasLtHandle_t cublasLtHandle() const { + if (!cublas_lt_) { + EIGEN_CUBLAS_CHECK(cublasLtCreate(&cublas_lt_)); + } + return cublas_lt_; + } + + /** Workspace buffer for cublasLtMatmul (grown lazily by cublaslt_gemm). + * Not thread-safe — all GEMM calls must be on this context's stream. */ + internal::DeviceBuffer* gemmWorkspace() const { return &gemm_workspace_; } + + /** cuSPARSE handle (lazy-initialized on first call). 
*/ + cusparseHandle_t cusparseHandle() const { + if (!cusparse_) { + cusparseStatus_t s1 = cusparseCreate(&cusparse_); + eigen_assert(s1 == CUSPARSE_STATUS_SUCCESS && "cusparseCreate failed"); + EIGEN_UNUSED_VARIABLE(s1); + cusparseStatus_t s2 = cusparseSetStream(cusparse_, stream_); + eigen_assert(s2 == CUSPARSE_STATUS_SUCCESS && "cusparseSetStream failed"); + EIGEN_UNUSED_VARIABLE(s2); + } + return cusparse_; + } + private: cudaStream_t stream_ = nullptr; cublasHandle_t cublas_ = nullptr; cusolverDnHandle_t cusolver_ = nullptr; + mutable cublasLtHandle_t cublas_lt_ = nullptr; // lazy + mutable cusparseHandle_t cusparse_ = nullptr; // lazy + mutable internal::DeviceBuffer gemm_workspace_; // lazy + bool owns_stream_ = true; + + static GpuContext*& tl_override_ptr() { + thread_local GpuContext* ptr = nullptr; + return ptr; + } + + void init_handles() { + EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_)); + EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_)); + EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_)); + EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_)); + } }; } // namespace Eigen diff --git a/Eigen/src/GPU/GpuSVD.h b/Eigen/src/GPU/GpuSVD.h index 693156551..220025db8 100644 --- a/Eigen/src/GPU/GpuSVD.h +++ b/Eigen/src/GPU/GpuSVD.h @@ -18,6 +18,7 @@ // GpuSVD svd(A, ComputeThinU | ComputeThinV); // VectorXd S = svd.singularValues(); // MatrixXd U = svd.matrixU(); // m×k or m×m +// MatrixXd V = svd.matrixV(); // n×k or n×n (matches JacobiSVD) // MatrixXd VT = svd.matrixVT(); // k×n or n×n (this is V^T) // MatrixXd X = svd.solve(B); // pseudoinverse // MatrixXd X = svd.solve(B, k); // truncated (top k triplets) @@ -53,6 +54,7 @@ class GpuSVD { ~GpuSVD() { if (handle_) (void)cusolverDnDestroy(handle_); + if (cublas_lt_) (void)cublasLtDestroy(cublas_lt_); if (cublas_) (void)cublasDestroy(cublas_); if (stream_) (void)cudaStreamDestroy(stream_); } @@ -185,6 +187,10 @@ class GpuSVD { } } + /** Right singular vectors V. Returns n_orig × k or n_orig × n_orig. + * Equivalent to matrixVT().adjoint(). Matches Eigen's JacobiSVD::matrixV() API. */ + PlainMatrix matrixV() const { return matrixVT().adjoint(); } + /** Right singular vectors transposed V^T. Returns k × n_orig or n_orig × n_orig. * For transposed case, VT comes from cuSOLVER's U. */ PlainMatrix matrixVT() const { @@ -254,6 +260,8 @@ class GpuSVD { cudaStream_t stream_ = nullptr; cusolverDnHandle_t handle_ = nullptr; cublasHandle_t cublas_ = nullptr; + cublasLtHandle_t cublas_lt_ = nullptr; + mutable internal::DeviceBuffer gemm_workspace_; internal::CusolverParams params_; internal::DeviceBuffer d_A_; // working copy of A (overwritten by gesvd) internal::DeviceBuffer d_U_; // left singular vectors @@ -277,6 +285,7 @@ class GpuSVD { EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_)); EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_)); EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_)); + EIGEN_CUBLASLT_CHECK(cublasLtCreate(&cublas_lt_)); ensure_scratch(0); } @@ -410,23 +419,21 @@ class GpuSVD { internal::DeviceBuffer d_tmp(static_cast(kk) * static_cast(nrhs) * sizeof(Scalar)); { Scalar alpha_one(1), beta_zero(0); - constexpr cudaDataType_t dtype = internal::cuda_data_type::value; - constexpr cublasComputeType_t compute = internal::cuda_compute_type::value; if (!transposed_) { // U_stored^H * B: (m_×kk)^H × (m_×nrhs) → kk×nrhs. 
- EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast(kk), static_cast(nrhs), - static_cast(m_), &alpha_one, d_U_.ptr, dtype, static_cast(m_), - d_B.ptr, dtype, static_cast(m_orig), &beta_zero, d_tmp.ptr, dtype, - static_cast(kk), compute, internal::cuda_gemm_algo())); + internal::cublaslt_gemm(cublas_lt_, cublas_, CUBLAS_OP_C, CUBLAS_OP_N, kk, nrhs, m_, &alpha_one, + static_cast(d_U_.ptr), m_, static_cast(d_B.ptr), + m_orig, &beta_zero, static_cast(d_tmp.ptr), kk, &gemm_workspace_, + stream_); } else { // VT_stored * B: VT_stored is vtrows×n_ = kk×m_orig (thin), NoTrans. // vtrows×m_orig times m_orig×nrhs → vtrows×nrhs. Use first kk rows. const Index vtrows_stored = (swap_uv_options(options_) & ComputeFullV) ? n_ : k; - EIGEN_CUBLAS_CHECK(cublasGemmEx( - cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast(kk), static_cast(nrhs), static_cast(m_orig), - &alpha_one, d_VT_.ptr, dtype, static_cast(vtrows_stored), d_B.ptr, dtype, static_cast(m_orig), - &beta_zero, d_tmp.ptr, dtype, static_cast(kk), compute, internal::cuda_gemm_algo())); + internal::cublaslt_gemm(cublas_lt_, cublas_, CUBLAS_OP_N, CUBLAS_OP_N, kk, nrhs, m_orig, &alpha_one, + static_cast(d_VT_.ptr), vtrows_stored, + static_cast(d_B.ptr), m_orig, &beta_zero, + static_cast(d_tmp.ptr), kk, &gemm_workspace_, stream_); } } @@ -458,21 +465,19 @@ class GpuSVD { { internal::DeviceBuffer d_X(static_cast(n_orig) * static_cast(nrhs) * sizeof(Scalar)); Scalar alpha_one(1), beta_zero(0); - constexpr cudaDataType_t dtype = internal::cuda_data_type::value; - constexpr cublasComputeType_t compute = internal::cuda_compute_type::value; if (!transposed_) { const Index vtrows = (options_ & ComputeFullV) ? n_ : k; - EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast(n_orig), - static_cast(nrhs), static_cast(kk), &alpha_one, d_VT_.ptr, dtype, - static_cast(vtrows), d_tmp.ptr, dtype, static_cast(kk), &beta_zero, - d_X.ptr, dtype, static_cast(n_orig), compute, internal::cuda_gemm_algo())); + internal::cublaslt_gemm(cublas_lt_, cublas_, CUBLAS_OP_C, CUBLAS_OP_N, n_orig, nrhs, kk, &alpha_one, + static_cast(d_VT_.ptr), vtrows, + static_cast(d_tmp.ptr), kk, &beta_zero, + static_cast(d_X.ptr), n_orig, &gemm_workspace_, stream_); } else { // U_stored is m_×ucols. V_orig = U_stored[:,:kk]. NoTrans × tmp. - EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast(n_orig), - static_cast(nrhs), static_cast(kk), &alpha_one, d_U_.ptr, dtype, - static_cast(m_), d_tmp.ptr, dtype, static_cast(kk), &beta_zero, - d_X.ptr, dtype, static_cast(n_orig), compute, internal::cuda_gemm_algo())); + internal::cublaslt_gemm(cublas_lt_, cublas_, CUBLAS_OP_N, CUBLAS_OP_N, n_orig, nrhs, kk, &alpha_one, + static_cast(d_U_.ptr), m_, static_cast(d_tmp.ptr), + kk, &beta_zero, static_cast(d_X.ptr), n_orig, &gemm_workspace_, + stream_); } EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_X.ptr, diff --git a/Eigen/src/GPU/GpuSparseContext.h b/Eigen/src/GPU/GpuSparseContext.h index 5e313acda..cbffe25fd 100644 --- a/Eigen/src/GPU/GpuSparseContext.h +++ b/Eigen/src/GPU/GpuSparseContext.h @@ -10,16 +10,25 @@ // GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix // multiply (SpMM) via cuSPARSE. // -// GpuSparseContext manages a cuSPARSE handle and device buffers. It accepts -// Eigen SparseMatrix (CSC) and performs SpMV/SpMM on the -// GPU. RowMajor input is implicitly converted to ColMajor. +// GpuSparseContext manages cuSPARSE descriptors and device buffers. 
It accepts +// Eigen SparseMatrix (CSC) and performs SpMV/SpMM on the GPU. +// RowMajor input is implicitly converted to ColMajor. +// +// Can borrow a GpuContext for same-stream execution with BLAS-1 ops (zero +// event overhead in iterative solvers like CG). // // Usage: +// // Standalone (own stream): // GpuSparseContext ctx; -// VectorXd y = ctx.multiply(A, x); // y = A * x -// ctx.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y -// VectorXd z = ctx.multiplyT(A, x); // z = A^T * x -// MatrixXd Y = ctx.multiplyMat(A, X); // Y = A * X (multiple RHS) +// VectorXd y = ctx.multiply(A, x); +// +// // Shared context (same stream as BLAS-1 ops): +// GpuContext gpu_ctx; +// GpuSparseContext sparse_ctx(gpu_ctx); +// VectorXd y = sparse_ctx.multiply(A, x); +// +// // Device-resident (no host roundtrip): +// sparse_ctx.multiply(A, d_x, d_y); // DeviceMatrix in/out #ifndef EIGEN_GPU_SPARSE_CONTEXT_H #define EIGEN_GPU_SPARSE_CONTEXT_H @@ -31,6 +40,57 @@ namespace Eigen { +// Forward declarations. +template +class GpuSparseContext; +template +class DeviceSparseView; + +/** SpMV expression: DeviceSparseView * DeviceMatrix → SpMVExpr. + * Evaluated by DeviceMatrix::operator=(SpMVExpr). */ +template +class SpMVExpr { + public: + using Scalar = Scalar_; + SpMVExpr(const DeviceSparseView& view, const DeviceMatrix& x) : view_(view), x_(x) {} + const DeviceSparseView& view() const { return view_; } + const DeviceMatrix& x() const { return x_; } + + private: + const DeviceSparseView& view_; + const DeviceMatrix& x_; +}; + +/** Device-resident sparse matrix view. Returned by GpuSparseContext::deviceView(). + * Lightweight handle referencing the context's cached device data. + * + * \warning One GpuSparseContext caches one sparse matrix at a time. + * Creating a second deviceView on the same context overwrites the first. + * For multiple simultaneous sparse matrices, use separate GpuSparseContext + * instances (they can share a GpuContext for same-stream execution). + * + * Supports `d_y = d_A * d_x` via SpMVExpr. */ +template +class DeviceSparseView { + public: + using Scalar = Scalar_; + using SpMat = SparseMatrix; + + DeviceSparseView(GpuSparseContext& ctx, const SpMat& A) : ctx_(ctx), A_(A) {} + + /** SpMV expression: d_A * d_x. Evaluated by DeviceMatrix::operator=. */ + SpMVExpr operator*(const DeviceMatrix& x) const { return SpMVExpr(*this, x); } + + Index rows() const { return A_.rows(); } + Index cols() const { return A_.cols(); } + const GpuSparseContext& context() const { return ctx_; } + const SpMat& matrix() const { return A_; } + + private: + GpuSparseContext& ctx_; + const SpMat& A_; +}; + template class GpuSparseContext { public: @@ -41,22 +101,47 @@ class GpuSparseContext { using DenseVector = Matrix; using DenseMatrix = Matrix; - GpuSparseContext() { + /** Standalone: creates own stream and cuSPARSE handle. */ + GpuSparseContext() : owns_handle_(true) { EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_)); + owns_stream_ = true; EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_)); EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_)); } + /** Borrow a GpuContext: shares stream and cuSPARSE handle. + * The GpuContext must outlive this GpuSparseContext. 
*/ + explicit GpuSparseContext(GpuContext& ctx) + : stream_(ctx.stream()), handle_(ctx.cusparseHandle()), owns_stream_(false), owns_handle_(false) {} + ~GpuSparseContext() { destroy_descriptors(); - if (handle_) (void)cusparseDestroy(handle_); - if (stream_) (void)cudaStreamDestroy(stream_); + if (owns_handle_ && handle_) (void)cusparseDestroy(handle_); + if (owns_stream_ && stream_) (void)cudaStreamDestroy(stream_); } GpuSparseContext(const GpuSparseContext&) = delete; GpuSparseContext& operator=(const GpuSparseContext&) = delete; - // ---- SpMV: y = A * x ----------------------------------------------------- + // ---- Device sparse view (for expression syntax: d_y = d_A * d_x) ---------- + + /** Upload a sparse matrix to device and return a lightweight view. + * The sparse data is uploaded immediately and cached in this context. + * The returned view can be used for repeated SpMV without re-uploading. + * If the matrix values change, call deviceView() again to re-upload. + * + * \warning One context caches one matrix. Calling deviceView() again + * overwrites the previous upload. For multiple simultaneous matrices, + * use separate GpuSparseContext instances sharing the same GpuContext. + * + * Supports `d_y = d_A * d_x` expression syntax. */ + DeviceSparseView deviceView(const SpMat& A) { + eigen_assert(A.isCompressed()); + upload_sparse(A); + return DeviceSparseView(*this, A); + } + + // ---- SpMV: y = A * x (host vectors) -------------------------------------- /** Compute y = A * x. Returns y as a new dense vector. */ template @@ -64,34 +149,52 @@ class GpuSparseContext { const SpMat mat(A.derived()); DenseVector y(mat.rows()); y.setZero(); - multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE); + multiply_host_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE); return y; } - /** Compute y = alpha * op(A) * x + beta * y (in-place). */ + /** Compute y = alpha * op(A) * x + beta * y (in-place, host vectors). */ template void multiply(const SparseMatrixBase& A, const MatrixBase& x, MatrixBase& y, Scalar alpha = Scalar(1), Scalar beta = Scalar(0), cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) { const SpMat mat(A.derived()); - multiply_impl(mat, x.derived(), y.derived(), alpha, beta, op); + multiply_host_impl(mat, x.derived(), y.derived(), alpha, beta, op); } - // ---- SpMV transpose: y = A^T * x ----------------------------------------- + // ---- SpMV: y = A * x (DeviceMatrix, no host roundtrip) ------------------- - /** Compute y = A^T * x. Returns y as a new dense vector. */ + /** Compute d_y = A * d_x. Device-resident, no host transfer. + * Sparse matrix A is uploaded to device (cached). Dense vectors stay on device. */ + template + void multiply(const SparseMatrixBase& A, const DeviceMatrix& d_x, DeviceMatrix& d_y) { + const SpMat mat(A.derived()); + multiply_device_impl(mat, d_x, d_y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE); + } + + /** Compute d_y = alpha * op(A) * d_x + beta * d_y (DeviceMatrix, in-place). */ + template + void multiply(const SparseMatrixBase& A, const DeviceMatrix& d_x, DeviceMatrix& d_y, + Scalar alpha, Scalar beta, cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) { + const SpMat mat(A.derived()); + multiply_device_impl(mat, d_x, d_y, alpha, beta, op); + } + + // ---- SpMV transpose ------------------------------------------------------- + + /** Compute y = A^T * x (host vectors). 
*/ template DenseVector multiplyT(const SparseMatrixBase& A, const MatrixBase& x) { const SpMat mat(A.derived()); DenseVector y(mat.cols()); y.setZero(); - multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE); + multiply_host_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE); return y; } - // ---- SpMM: Y = A * X (multiple RHS) -------------------------------------- + // ---- SpMM: Y = A * X (host, multiple RHS) -------------------------------- - /** Compute Y = A * X where X is a dense matrix (multiple RHS). Returns Y. */ + /** Compute Y = A * X where X is a dense matrix. Returns Y. */ template DenseMatrix multiplyMat(const SparseMatrixBase& A, const MatrixBase& X) { const SpMat mat(A.derived()); @@ -114,32 +217,37 @@ class GpuSparseContext { private: cudaStream_t stream_ = nullptr; cusparseHandle_t handle_ = nullptr; + bool owns_stream_ = false; + bool owns_handle_ = false; - // Cached device buffers (grow-only). + // Cached device buffers for sparse matrix (grow-only). internal::DeviceBuffer d_outerPtr_; internal::DeviceBuffer d_innerIdx_; internal::DeviceBuffer d_values_; - internal::DeviceBuffer d_x_; - internal::DeviceBuffer d_y_; - internal::DeviceBuffer d_workspace_; size_t d_outerPtr_size_ = 0; size_t d_innerIdx_size_ = 0; size_t d_values_size_ = 0; + + // Cached device buffers for host-API dense vectors (grow-only). + internal::DeviceBuffer d_x_; + internal::DeviceBuffer d_y_; size_t d_x_size_ = 0; size_t d_y_size_ = 0; - size_t d_workspace_size_ = 0; - // Cached cuSPARSE descriptors. + mutable internal::DeviceBuffer d_workspace_; + mutable size_t d_workspace_size_ = 0; + + // Cached cuSPARSE sparse matrix descriptor. cusparseSpMatDescr_t spmat_desc_ = nullptr; Index cached_rows_ = -1; Index cached_cols_ = -1; Index cached_nnz_ = -1; - // ---- SpMV implementation -------------------------------------------------- + // ---- SpMV with host vectors (upload/download per call) -------------------- template - void multiply_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta, - cusparseOperation_t op) { + void multiply_host_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta, + cusparseOperation_t op) { eigen_assert(A.isCompressed()); const Index m = A.rows(); @@ -159,16 +267,13 @@ class GpuSparseContext { return; } - // Upload sparse matrix to device. upload_sparse(A); - // Upload x to device. ensure_buffer(d_x_, d_x_size_, static_cast(x_size) * sizeof(Scalar)); const DenseVector x_tmp(x); EIGEN_CUDA_RUNTIME_CHECK( cudaMemcpyAsync(d_x_.ptr, x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_)); - // Upload y to device (for beta != 0). ensure_buffer(d_y_, d_y_size_, static_cast(y_size) * sizeof(Scalar)); if (beta != Scalar(0)) { const DenseVector y_tmp(y); @@ -176,27 +281,80 @@ class GpuSparseContext { cudaMemcpyAsync(d_y_.ptr, y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_)); } - // Create dense vector descriptors. + exec_spmv(x_size, y_size, d_x_.ptr, d_y_.ptr, alpha, beta, op); + + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(y.data(), d_y_.ptr, y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + } + + // ---- SpMV with DeviceMatrix (no host transfer) ---------------------------- + + // Called by public multiply(A, d_x, d_y) — always re-uploads A. 
+ void multiply_device_impl(const SpMat& A, const DeviceMatrix& d_x, DeviceMatrix& d_y, Scalar alpha, + Scalar beta, cusparseOperation_t op) { + upload_sparse(A); + spmv_device_exec(d_x, d_y, alpha, beta, op); + } + + public: + /** Execute SpMV using the already-uploaded sparse matrix (no re-upload). + * Used by SpMVExpr (d_y = d_A * d_x) for cached deviceView() paths. + * The sparse matrix must have been uploaded via deviceView() or multiply(). */ + void spmv_device_exec(const DeviceMatrix& d_x, DeviceMatrix& d_y, Scalar alpha = Scalar(1), + Scalar beta = Scalar(0), cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) const { + eigen_assert(spmat_desc_ && "sparse matrix not uploaded — call deviceView() or multiply() first"); + // cuSPARSE SpMV: y must not alias x (undefined behavior). + eigen_assert(d_x.data() != d_y.data() && "SpMV: output aliases input vector"); + + const Index m = cached_rows_; + const Index n = cached_cols_; + const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m; + const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n; + + eigen_assert(d_x.rows() * d_x.cols() == x_size); + + if (m == 0 || n == 0 || cached_nnz_ == 0) { + d_y.resize(y_size, 1); + if (beta == Scalar(0)) { + d_y.setZero(); + } + return; + } + + // Ensure d_y is allocated. + if (d_y.rows() * d_y.cols() != y_size) { + d_y.resize(y_size, 1); + } + + // Wait for input data to be ready on this stream. + d_x.waitReady(stream_); + d_y.waitReady(stream_); + + exec_spmv(x_size, y_size, const_cast(static_cast(d_x.data())), static_cast(d_y.data()), + alpha, beta, op); + + d_y.recordReady(stream_); + } + + private: + // ---- Shared SpMV execution ------------------------------------------------ + + void exec_spmv(Index x_size, Index y_size, void* d_x_ptr, void* d_y_ptr, Scalar alpha, Scalar beta, + cusparseOperation_t op) const { constexpr cudaDataType_t dtype = internal::cuda_data_type::value; cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr; - EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_.ptr, dtype)); - EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_.ptr, dtype)); + EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_ptr, dtype)); + EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_ptr, dtype)); - // Query workspace size. size_t ws_size = 0; EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype, CUSPARSE_SPMV_ALG_DEFAULT, &ws_size)); ensure_buffer(d_workspace_, d_workspace_size_, ws_size); - // Execute SpMV. EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype, CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.ptr)); - // Download result. - EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(y.data(), d_y_.ptr, y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_)); - EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); - (void)cusparseDestroyDnVec(x_desc); (void)cusparseDestroyDnVec(y_desc); } @@ -222,7 +380,6 @@ class GpuSparseContext { upload_sparse(A); - // Upload X to device. const size_t x_bytes = static_cast(k) * static_cast(n) * sizeof(Scalar); const size_t y_bytes = static_cast(m) * static_cast(n) * sizeof(Scalar); ensure_buffer(d_x_, d_x_size_, x_bytes); @@ -232,24 +389,19 @@ class GpuSparseContext { EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.ptr, Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_)); } - // Create dense matrix descriptors. 
constexpr cudaDataType_t dtype = internal::cuda_data_type::value; cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr; - // Eigen is column-major, so ld = rows. EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k, n, k, d_x_.ptr, dtype, CUSPARSE_ORDER_COL)); EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m, n, m, d_y_.ptr, dtype, CUSPARSE_ORDER_COL)); - // Query workspace. size_t ws_size = 0; EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size)); ensure_buffer(d_workspace_, d_workspace_size_, ws_size); - // Execute SpMM. EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.ptr)); - // Download result. EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.ptr, y_bytes, cudaMemcpyDeviceToHost, stream_)); EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); @@ -278,21 +430,18 @@ class GpuSparseContext { cudaMemcpyAsync(d_innerIdx_.ptr, A.innerIndexPtr(), inner_bytes, cudaMemcpyHostToDevice, stream_)); EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, A.valuePtr(), val_bytes, cudaMemcpyHostToDevice, stream_)); - // Recreate descriptor if shape changed. if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) { destroy_descriptors(); constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I; constexpr cudaDataType_t val_type = internal::cuda_data_type::value; - // ColMajor → CSC. outerIndexPtr = col offsets, innerIndexPtr = row indices. EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr, idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO, val_type)); cached_rows_ = m; cached_cols_ = n; cached_nnz_ = nnz; } else { - // Same shape — just update pointers. EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr)); } } @@ -307,7 +456,7 @@ class GpuSparseContext { cached_nnz_ = -1; } - void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) { + void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) const { if (needed > current_size) { if (buf.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); buf = internal::DeviceBuffer(needed); @@ -316,6 +465,17 @@ class GpuSparseContext { } }; +// ---- DeviceMatrix::operator=(SpMVExpr) out-of-line definition ---------------- +// Defined here because it needs the full GpuSparseContext definition. + +template +DeviceMatrix& DeviceMatrix::operator=(const SpMVExpr& expr) { + // Use spmv_device_exec — the sparse matrix was already uploaded by deviceView(). + // No re-upload on repeated SpMV with the same view. 
+ expr.view().context().spmv_device_exec(expr.x(), *this, Scalar_(1), Scalar_(0), CUSPARSE_OPERATION_NON_TRANSPOSE); + return *this; +} + } // namespace Eigen #endif // EIGEN_GPU_SPARSE_CONTEXT_H diff --git a/Eigen/src/GPU/GpuSupport.h b/Eigen/src/GPU/GpuSupport.h index 302121017..9946d2519 100644 --- a/Eigen/src/GPU/GpuSupport.h +++ b/Eigen/src/GPU/GpuSupport.h @@ -21,6 +21,7 @@ #include "./InternalHeaderCheck.h" #include +#include namespace Eigen { namespace internal { @@ -36,26 +37,99 @@ namespace internal { // ---- RAII: device buffer ---------------------------------------------------- +// Thread-local pool of small device buffers to avoid cudaMalloc/cudaFree +// overhead for tiny allocations (e.g., DeviceScalar). Buffers up to +// kSmallBufferThreshold bytes are recycled; larger allocations bypass the pool. +template +struct DeviceBufferPool { + static constexpr size_t kSmallBufferThreshold = SmallBufferThreshold; + static constexpr size_t kMaxPoolSize = MaxPoolSize; + + struct Entry { + void* ptr; + size_t bytes; + }; + + ~DeviceBufferPool() { + for (auto& e : free_list_) (void)cudaFree(e.ptr); + } + + void* allocate(size_t bytes) { + // Search for a buffer of sufficient size. + for (size_t i = 0; i < free_list_.size(); ++i) { + if (free_list_[i].bytes >= bytes) { + void* p = free_list_[i].ptr; + free_list_[i] = free_list_.back(); + free_list_.pop_back(); + return p; + } + } + // No suitable buffer found — allocate new. + void* p = nullptr; + EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes)); + return p; + } + + void deallocate(void* p, size_t bytes) { + if (free_list_.size() < kMaxPoolSize) { + free_list_.push_back({p, bytes}); + } else { + (void)cudaFree(p); + } + } + + static DeviceBufferPool& threadLocal() { + thread_local DeviceBufferPool pool; + return pool; + } + + private: + std::vector free_list_; +}; + struct DeviceBuffer { void* ptr = nullptr; DeviceBuffer() = default; - explicit DeviceBuffer(size_t bytes) { - if (bytes > 0) EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&ptr, bytes)); + explicit DeviceBuffer(size_t bytes) : size_(bytes) { + if (bytes > 0) { + if (bytes <= DeviceBufferPool<>::kSmallBufferThreshold) { + ptr = DeviceBufferPool<>::threadLocal().allocate(bytes); + } else { + EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&ptr, bytes)); + } + } } ~DeviceBuffer() { - if (ptr) (void)cudaFree(ptr); // destructor: ignore errors + if (ptr) { + if (size_ <= DeviceBufferPool<>::kSmallBufferThreshold) { + DeviceBufferPool<>::threadLocal().deallocate(ptr, size_); + } else { + (void)cudaFree(ptr); + } + } } // Move-only. - DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr) { o.ptr = nullptr; } + DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr), size_(o.size_) { + o.ptr = nullptr; + o.size_ = 0; + } DeviceBuffer& operator=(DeviceBuffer&& o) noexcept { if (this != &o) { - if (ptr) (void)cudaFree(ptr); + if (ptr) { + if (size_ <= DeviceBufferPool<>::kSmallBufferThreshold) { + DeviceBufferPool<>::threadLocal().deallocate(ptr, size_); + } else { + (void)cudaFree(ptr); + } + } ptr = o.ptr; + size_ = o.size_; o.ptr = nullptr; + o.size_ = 0; } return *this; } @@ -63,12 +137,19 @@ struct DeviceBuffer { DeviceBuffer(const DeviceBuffer&) = delete; DeviceBuffer& operator=(const DeviceBuffer&) = delete; + size_t size() const { return size_; } + // Adopt an existing device pointer. Caller relinquishes ownership. + // Adopted buffers bypass the pool on destruction. 
static DeviceBuffer adopt(void* p) { DeviceBuffer b; b.ptr = p; + b.size_ = DeviceBufferPool<>::kSmallBufferThreshold + 1; // force cudaFree return b; } + + private: + size_t size_ = 0; }; // ---- Scalar → cudaDataType_t ------------------------------------------------ diff --git a/Eigen/src/GPU/README.md b/Eigen/src/GPU/README.md index 10e9b3113..dcfdaa8ed 100644 --- a/Eigen/src/GPU/README.md +++ b/Eigen/src/GPU/README.md @@ -62,6 +62,23 @@ The GPU version reads like CPU Eigen with explicit upload/download for dense operations, and an almost identical API for sparse solvers. Unsupported expressions are compile errors. +**Standalone module.** `Eigen/GPU` does not modify or depend on Eigen's Core +expression template system (`MatrixBase`, `CwiseBinaryOp`, etc.). +`DeviceMatrix` is not an Eigen expression type and does not inherit from +`MatrixBase`. The expression layer is a thin compile-time dispatch where every +supported expression maps to a single NVIDIA library call. There is no +coefficient-level evaluation, lazy fusion, or packet operations. + +**Interoperability where useful.** `DeviceMatrix` provides the same operator +signatures as `Matrix` for common vector operations: `+=`, `-=`, `*=`, +`dot()`, `squaredNorm()`, `norm()`, `setZero()`, and `noalias()`. This makes +`DeviceMatrix` usable as a drop-in `VectorType` in Eigen algorithm templates +that rely on these operations. For example, Eigen's `conjugate_gradient()` +template works with `DeviceMatrix` with a single typedef change -- no +modifications to the algorithm or the expression template system. Conjugate +gradient is just the motivating example; we are open to expanding operator +coverage as needed to support other high-level Eigen algorithms on the GPU. + **Explicit over implicit.** Host-device transfers, stream management, and library handle lifetimes are visible in the API. There are no hidden allocations or synchronizations except where documented (e.g., `toHost()` must @@ -96,6 +113,27 @@ MatrixXd C = transfer.get(); `selfadjointView()`, `llt()`, `lu()`. These return lightweight expression objects that are evaluated when assigned. +For BLAS Level-1 operations, `DeviceMatrix` also provides `dot()`, `norm()`, +`squaredNorm()`, `setZero()`, `noalias()`, and arithmetic operators +(`+=`, `-=`, `*=`) that dispatch to cuBLAS `axpy`, `nrm2`, `dot`, and +`geam`. These are the operations needed by iterative solvers. + +### `DeviceScalar` + +A device-resident scalar value. Reductions like `dot()`, `norm()`, and +`squaredNorm()` return `DeviceScalar` instead of a host scalar, deferring +the host synchronization until the value is actually needed: + +```cpp +auto dot_val = d_x.dot(d_y); // DeviceScalar -- no sync +auto norm_sq = d_r.squaredNorm(); // DeviceScalar -- no sync +Scalar alpha = dot_val / norm_sq; // sync here (implicit conversion) +d_x += alpha * d_p; // host scalar * DeviceMatrix (axpy) +``` + +Division between `DeviceScalar` values (real types only) is performed on +device via NPP, avoiding extra synchronizations. 
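+A fully device-side update chain therefore needs no synchronization at all.
+A minimal sketch, assuming `d_x`, `d_p`, `d_q`, and `d_r` are existing
+device vectors of matching size:
+
+```cpp
+auto num   = d_r.squaredNorm();   // DeviceScalar -- no sync
+auto den   = d_p.dot(d_q);        // DeviceScalar -- no sync
+auto alpha = num / den;           // device-side division via NPP -- still no sync
+d_x += alpha * d_p;               // axpy with device-pointer alpha -- still no sync
+```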
+ ### `GpuContext` Every GPU operation needs a CUDA stream and library handles (cuBLAS, @@ -118,6 +156,12 @@ d_C1.device(ctx1) = d_A1 * d_B1; // runs on stream 1 d_C2.device(ctx2) = d_A2 * d_B2; // runs on stream 2 (concurrently) ``` +To integrate with existing CUDA code, borrow an existing stream: + +```cpp +GpuContext ctx(my_existing_stream); // wraps stream, does not take ownership +``` + ## Usage ### Matrix operations (cuBLAS) @@ -133,7 +177,7 @@ d_C = d_A * d_B.transpose(); // Scaled and accumulated d_C += 2.0 * d_A * d_B; // alpha=2, beta=1 -d_C.device(ctx) -= d_A * d_B; // alpha=-1, beta=1 (requires explicit context) +d_C.device(ctx) -= d_A * d_B; // alpha=-1, beta=1 (GEMM requires explicit context for -=) // Triangular solve (TRSM) d_X = d_A.triangularView().solve(d_B); @@ -145,6 +189,30 @@ d_C = d_A.selfadjointView() * d_B; d_C.selfadjointView().rankUpdate(d_A); // C += A * A^H ``` +### BLAS Level-1 operations + +```cpp +// Dot product and norms (return DeviceScalar -- no sync until read) +auto dot_val = d_x.dot(d_y); // cublasDdot / cublasCdotc +auto norm_val = d_r.norm(); // cublasDnrm2 +double n = norm_val; // implicit conversion triggers sync + +// Vector arithmetic (cuBLAS axpy / geam) +d_x += alpha * d_p; // axpy: x = x + alpha * p +d_x -= alpha * d_p; // axpy: x = x - alpha * p +d_x *= alpha; // scal: x = alpha * x +d_r.setZero(); // cudaMemsetAsync + +// DeviceScalar arithmetic (stays on device, real types only) +auto alpha = absNew / dot_val; // device-side division via NPP +d_x += alpha * d_p; // DeviceScalar * DeviceMatrix (axpy with device pointer) + +// Matrix add/subtract (cuBLAS geam) +DeviceMatrix d_C = d_A + d_B; // C = A + B +d_C = d_A + 2.0 * d_B; // C = A + 2*B +d_C = d_A - d_B; // C = A - B +``` + ### Dense solvers (cuSOLVER) **One-shot expression syntax** -- Convenient, re-factorizes each time: @@ -183,6 +251,7 @@ GpuSVD svd; svd.compute(d_A, ComputeThinU | ComputeThinV); VectorXd S = svd.singularValues(); // downloads to host MatrixXd U = svd.matrixU(); // downloads to host +MatrixXd V = svd.matrixV(); // V (matches JacobiSVD) MatrixXd VT = svd.matrixVT(); // V^T (matches cuSOLVER) // Self-adjoint eigenvalue decomposition (results downloaded on access) @@ -253,21 +322,60 @@ MatrixXcf C = fft.inv2d(B); // 2D inverse (scaled by 1/(rows*cols)) SparseMatrix A = ...; VectorXd x = ...; -GpuSparseContext ctx; -VectorXd y = ctx.multiply(A, x); // y = A * x -VectorXd z = ctx.multiplyT(A, x); // z = A^T * x -ctx.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y +// Host vectors (upload/download handled internally) +GpuSparseContext spmv; +VectorXd y = spmv.multiply(A, x); // y = A * x +VectorXd z = spmv.multiplyT(A, x); // z = A^T * x +spmv.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y +MatrixXd Y = spmv.multiplyMat(A, X); // Y = A * X (SpMM) -// Multiple RHS (SpMM) -MatrixXd Y = ctx.multiplyMat(A, X); // Y = A * X +// Device-resident SpMV (sparse matrix cached on device) +GpuSparseContext spmv(ctx); // share GpuContext for same-stream +auto d_A = spmv.deviceView(A); // upload sparse matrix once +d_y = d_A * d_x; // operator syntax, stays on device +``` + +### Eigen algorithm interop (example: Conjugate gradient) + +The BLAS-1 operators and `DeviceSparseView` make `DeviceMatrix` usable as a +vector type in GPU implementations of algorithms like conjugate gradient. 
+Conjugate gradient is the motivating example -- a GPU CG implementation +uses the same operations as the CPU version: + +```cpp +GpuContext ctx; +GpuSparseContext spmv(ctx); +auto d_A = spmv.deviceView(A); // sparse matrix on device +auto d_b = DeviceMatrix::fromHost(b); +auto d_x = DeviceMatrix::fromHost(x0); + +// CG iteration using DeviceMatrix operators +DeviceMatrix d_r = d_b; // r = b (deep copy via geam) +DeviceMatrix d_p(n), d_tmp(n); +d_tmp = d_A * d_x; // SpMV (device-resident) +d_r -= d_tmp; // axpy +d_p = d_r.clone(); +RealScalar absNew = d_r.squaredNorm(); // DeviceScalar -> implicit sync + +for (int i = 0; i < maxIters && absNew > tol * tol; ++i) { + d_tmp = d_A * d_p; // SpMV + auto alpha = absNew / d_p.dot(d_tmp); // host / DeviceScalar -> DeviceScalar + d_x += alpha * d_p; // axpy with DeviceScalar + d_r -= alpha * d_tmp; // axpy with DeviceScalar + RealScalar absOld = absNew; + absNew = d_r.squaredNorm(); // DeviceScalar -> implicit sync + d_p *= Scalar(absNew / absOld); // scal (host scalars) + d_p += d_r; // axpy +} +MatrixXd x = d_x.toHost(); ``` ### Precision control -GEMM dispatch enables tensor core algorithms by default, allowing cuBLAS to -choose the fastest algorithm for the given precision and architecture. For -double precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64 -results computed faster via tensor cores. +GEMM dispatch uses `cublasLtMatmul` with heuristic algorithm selection, +enabling cuBLAS to choose tensor core algorithms when beneficial. For double +precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64 results +computed faster via tensor cores. | Macro | Effect | |---|---| @@ -290,6 +398,7 @@ Mandatory sync points: - `fromHost()` -- Synchronizes to complete the upload before returning - `toHost()` / `HostTransfer::get()` -- Must deliver data to host - `info()` -- Must read the factorization status +- `DeviceScalar` implicit conversion -- Downloads scalar from device **Cross-stream safety** is automatic. `DeviceMatrix` tracks write completion via CUDA events. When a matrix written on stream A is read on stream B, the @@ -307,21 +416,33 @@ noted otherwise). 
| DeviceMatrix expression | Library call | Parameters | |---|---|---| -| `C = A * B` | `cublasGemmEx` | transA=N, transB=N, alpha=1, beta=0 | -| `C = A.adjoint() * B` | `cublasGemmEx` | transA=C, transB=N | -| `C = A.transpose() * B` | `cublasGemmEx` | transA=T, transB=N | -| `C = A * B.adjoint()` | `cublasGemmEx` | transA=N, transB=C | -| `C = A * B.transpose()` | `cublasGemmEx` | transA=N, transB=T | -| `C = alpha * A * B` | `cublasGemmEx` | alpha from LHS | -| `C = A * (alpha * B)` | `cublasGemmEx` | alpha from RHS | -| `C += A * B` | `cublasGemmEx` | alpha=1, beta=1 | -| `C.device(ctx) -= A * B` | `cublasGemmEx` | alpha=-1, beta=1 | +| `C = A * B` | `cublasLtMatmul` | transA=N, transB=N, alpha=1, beta=0 | +| `C = A.adjoint() * B` | `cublasLtMatmul` | transA=C, transB=N | +| `C = A.transpose() * B` | `cublasLtMatmul` | transA=T, transB=N | +| `C = A * B.adjoint()` | `cublasLtMatmul` | transA=N, transB=C | +| `C = A * B.transpose()` | `cublasLtMatmul` | transA=N, transB=T | +| `C = alpha * A * B` | `cublasLtMatmul` | alpha from LHS | +| `C = A * (alpha * B)` | `cublasLtMatmul` | alpha from RHS | +| `C += A * B` | `cublasLtMatmul` | alpha=1, beta=1 | +| `C.device(ctx) -= A * B` | `cublasLtMatmul` | alpha=-1, beta=1 | | `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs | | `X = A.llt().solve(B)` | same | uplo=Upper | | `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs | | `X = A.triangularView().solve(B)` | `cublasXtrsm` | side=L, uplo, diag=NonUnit | | `C = A.selfadjointView() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo | | `C.selfadjointView().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N | +| `C = A + B` | `cublasXgeam` | alpha=1, beta=1 | +| `C = A + alpha * B` | `cublasXgeam` | alpha=1, beta from scaled | +| `C = A - B` | `cublasXgeam` | alpha=1, beta=-1 | +| `C = A - alpha * B` | `cublasXgeam` | alpha=1, beta=-scaled | +| `x += alpha * y` | `cublasXaxpy` | alpha (host scalar) | +| `x += dAlpha * y` | `cublasXaxpy` | alpha (DeviceScalar, device pointer mode) | +| `x -= alpha * y` | `cublasXaxpy` | alpha negated | +| `x *= alpha` | `cublasXscal` | alpha (host or DeviceScalar) | +| `x.dot(y)` | `cublasXdot` / `cublasXdotc` | returns `DeviceScalar` | +| `x.norm()` | `cublasXnrm2` | returns `DeviceScalar` | +| `x.squaredNorm()` | `cublasXdot(x, x)` | returns `DeviceScalar` | +| `d_y = view * d_x` | `cusparseSpMV` | device-resident SpMV | ### `DeviceMatrix` @@ -332,11 +453,12 @@ one column. 
```cpp // Construction DeviceMatrix() // Empty (0x0) +DeviceMatrix(Index n) // Allocate column vector (n x 1) DeviceMatrix(rows, cols) // Allocate uninitialized // Upload / download static DeviceMatrix fromHost(matrix, stream=nullptr) // -> DeviceMatrix (syncs) -static DeviceMatrix fromHostAsync(ptr, rows, cols, outerStride, s) // -> DeviceMatrix (no sync, caller manages ptr lifetime) +static DeviceMatrix fromHostAsync(ptr, rows, cols, stream) // -> DeviceMatrix (no sync, caller manages ptr lifetime) PlainMatrix toHost(stream=nullptr) // -> host Matrix (syncs) HostTransfer toHostAsync(stream=nullptr) // -> HostTransfer future (no sync) DeviceMatrix clone(stream=nullptr) // -> DeviceMatrix (D2D copy, async) @@ -357,6 +479,50 @@ LuExpr lu() // -> .solve(d_B) -> De TriangularView triangularView() // -> .solve(d_B) -> DeviceMatrix (TRSM) SelfAdjointView selfadjointView() // -> * d_B (SYMM), .rankUpdate(d_A) (SYRK) DeviceAssignment device(GpuContext& ctx) // Bind assignment to explicit stream +DeviceMatrix& noalias() // No-op (all ops are implicitly noalias) + +// BLAS Level-1 (all have overloads with explicit GpuContext& parameter) +DeviceScalar dot(const DeviceMatrix& other) // cuBLAS dot/dotc -> DeviceScalar +DeviceScalar norm() // cuBLAS nrm2 -> DeviceScalar +DeviceScalar squaredNorm() // dot(self, self) -> DeviceScalar (no sync) +void setZero() // cudaMemsetAsync +void addScaled(GpuContext&, Scalar alpha, const DeviceMatrix& x) // this += alpha * x (axpy) +void scale(GpuContext&, Scalar alpha) // this *= alpha (scal) +void copyFrom(GpuContext&, const DeviceMatrix& other) // this = other (D2D copy) +DeviceMatrix& operator+=(Scalar * DeviceMatrix) // cuBLAS axpy +DeviceMatrix& operator-=(Scalar * DeviceMatrix) // cuBLAS axpy (negated) +DeviceMatrix& operator+=(const DeviceMatrix&) // cuBLAS axpy +DeviceMatrix& operator-=(const DeviceMatrix&) // cuBLAS axpy +DeviceMatrix& operator+=(const DeviceScaledDevice&) // cuBLAS axpy (DeviceScalar * DeviceMatrix) +DeviceMatrix& operator-=(const DeviceScaledDevice&) // cuBLAS axpy (DeviceScalar * DeviceMatrix, negated) +DeviceMatrix& operator*=(Scalar) // cuBLAS scal +DeviceMatrix& operator*=(const DeviceScalar&) // cuBLAS scal (device pointer) +DeviceMatrix cwiseProduct(GpuContext&, const DeviceMatrix&) // NPP nppsMul (float/double only) +void cwiseProduct(GpuContext&, const DeviceMatrix&, const DeviceMatrix&) // in-place: this = a .* b + +// geam expressions (evaluated on assignment) +DeviceMatrix& operator=(const DeviceAddExpr&) // C = A + B, C = A + alpha*B, C = A - B, etc. +``` + +### `DeviceScalar` + +Device-resident scalar. Returned by `dot()`, `norm()`, and `squaredNorm()`. +Implicit conversion to `Scalar` triggers `cudaStreamSynchronize` + download. + +```cpp +DeviceScalar(cudaStream_t stream = nullptr) // Allocate uninitialized +DeviceScalar(Scalar host_val, cudaStream_t stream) // Upload host value + +Scalar get() // Download (syncs stream) + operator Scalar() // Implicit conversion (syncs) +Scalar* devicePtr() // Raw device pointer +cudaStream_t stream() + +// Device-side arithmetic (no host sync, real types only) +DeviceScalar operator/(DeviceScalar, DeviceScalar) // NPP nppsDiv +DeviceScalar operator/(Scalar, DeviceScalar) // upload + div +DeviceScalar operator/(DeviceScalar, Scalar) // upload + div +DeviceScalar operator-() // NPP nppsMulC(-1) ``` ### `GpuContext` @@ -365,11 +531,15 @@ Unified GPU execution context owning a CUDA stream and library handles. 
```cpp GpuContext() // Creates dedicated stream + handles +GpuContext(cudaStream_t stream) // Borrow existing stream (not owned) static GpuContext& threadLocal() // Per-thread default (lazy-created) +static void setThreadLocal(GpuContext* ctx) // Override thread-local default (nullptr restores) cudaStream_t stream() cublasHandle_t cublasHandle() cusolverDnHandle_t cusolverHandle() +cublasLtHandle_t cublasLtHandle() // Lazy-initialized +cusparseHandle_t cusparseHandle() // Lazy-initialized ``` Non-copyable, non-movable (owns library handles). @@ -440,6 +610,7 @@ GpuSVD& compute(const DeviceMatrix& d_A, unsigned options = ComputeTh RealVector singularValues() // -> host vector (syncs, downloads) PlainMatrix matrixU() // -> host Matrix (syncs, downloads) +PlainMatrix matrixV() // -> host Matrix (V = VT^H, matches JacobiSVD) PlainMatrix matrixVT() // -> host Matrix (syncs, downloads V^T) PlainMatrix solve(const MatrixBase& B) // -> host Matrix (pseudoinverse) @@ -452,9 +623,9 @@ Index rows() / cols() cudaStream_t stream() ``` -**Note:** `singularValues()`, `matrixU()`, and `matrixVT()` download to host -on each call. Device-side accessors returning `DeviceMatrix` are planned but -not yet implemented. +**Note:** `singularValues()`, `matrixU()`, `matrixV()`, and `matrixVT()` +download to host on each call. Device-side accessors returning `DeviceMatrix` +are planned but not yet implemented. ### `GpuSelfAdjointEigenSolver` -- Eigendecomposition (cuSOLVER) @@ -544,28 +715,47 @@ the input scalar type (complex vs real). ### `GpuSparseContext` -- SpMV/SpMM (cuSPARSE) -Accepts `SparseMatrix`. All methods accept host data and -return host data. +Accepts `SparseMatrix`. ```cpp GpuSparseContext() // Creates own stream + cuSPARSE handle +GpuSparseContext(GpuContext& ctx) // Borrow GpuContext for same-stream execution -DenseVector multiply(A, x) // y = A * x -void multiply(A, x, y, alpha=1, beta=0, // y = alpha*op(A)*x + beta*y +// Host data in/out +DenseVector multiply(A, x) // y = A * x +void multiply(A, x, y, alpha=1, beta=0, // y = alpha*op(A)*x + beta*y op=CUSPARSE_OPERATION_NON_TRANSPOSE) -DenseVector multiplyT(A, x) // y = A^T * x -DenseMatrix multiplyMat(A, X) // Y = A * X (SpMM) +DenseVector multiplyT(A, x) // y = A^T * x +DenseMatrix multiplyMat(A, X) // Y = A * X (SpMM) + +// DeviceMatrix in/out (sparse matrix re-uploaded each call) +void multiply(A, d_x, d_y) // SpMV with device vectors +void multiply(A, d_x, d_y, alpha, beta, op) + +// Device-resident sparse matrix (upload once, reuse) +DeviceSparseView deviceView(A) // Upload sparse matrix, return view cudaStream_t stream() ``` +### `DeviceSparseView` -- Device-resident sparse matrix + +Returned by `GpuSparseContext::deviceView()`. Holds a sparse matrix on device +for repeated SpMV without re-uploading. + +```cpp +SpMVExpr operator*(const DeviceMatrix& d_x) // d_y = view * d_x (evaluated on assignment) +``` + ### Aliasing Unlike Eigen's `Matrix`, where omitting `.noalias()` triggers a copy to a temporary, DeviceMatrix dispatches directly to NVIDIA library calls which have no built-in aliasing protection. All operations are implicitly noalias. The caller must ensure operands don't alias the destination for GEMM and TRSM -(debug asserts catch violations). +(debug asserts catch violations). `geam` expressions (`d_C = d_A + alpha * d_B`) +are safe with aliasing. The `.noalias()` method exists as a no-op for Eigen +template compatibility. 
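+A short sketch of the aliasing rules (the GEMM aliasing assert fires only in
+debug builds):
+
+```cpp
+d_C = d_A + 2.0 * d_C;        // geam: aliasing the destination is safe
+d_C.noalias() = d_A * d_B;    // GEMM: noalias() is a no-op; caller guarantees no aliasing
+// d_C = d_A * d_C;           // GEMM operand aliases the destination -- debug assert fires
+```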
## File layout @@ -573,12 +763,14 @@ The caller must ensure operands don't alias the destination for GEMM and TRSM |------|-----------|----------| | `GpuSupport.h` | `` | Error macro, `DeviceBuffer`, `cuda_data_type<>` | | `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` | -| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM expression wrappers | +| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM and geam expression wrappers | | `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers | | `DeviceSolverExpr.h` | `DeviceMatrix.h` | Solver expression wrappers (LLT, LU) | +| `DeviceScalar.h` | `GpuSupport.h`, `DeviceScalarOps.h` | `DeviceScalar<>` (device-resident scalar) | +| `DeviceScalarOps.h` | `` | Scalar div/neg/cwiseProduct via NPP | | `DeviceDispatch.h` | all above | All dispatch functions + `DeviceAssignment` | | `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `GpuContext` | -| `CuBlasSupport.h` | `GpuSupport.h`, `` | cuBLAS error macro, op/compute type maps | +| `CuBlasSupport.h` | `GpuSupport.h`, ``, `` | cuBLAS/cuBLASLt error macro, type maps | | `CuSolverSupport.h` | `GpuSupport.h`, `` | cuSOLVER params, fill-mode mapping | | `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization | | `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization | @@ -588,7 +780,7 @@ The caller must ensure operands don't alias the destination for GEMM and TRSM | `CuFftSupport.h` | `GpuSupport.h`, `` | cuFFT error macro, type-dispatch wrappers | | `GpuFFT.h` | `CuFftSupport.h`, `CuBlasSupport.h` | 1D/2D FFT with plan caching | | `CuSparseSupport.h` | `GpuSupport.h`, `` | cuSPARSE error macro | -| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE | +| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE, `DeviceSparseView` | | `CuDssSupport.h` | `GpuSupport.h`, `` | cuDSS error macro, type traits (optional) | | `GpuSparseSolverBase.h` | `CuDssSupport.h` | CRTP base for sparse solvers (optional) | | `GpuSparseLLT.h` | `GpuSparseSolverBase.h` | Sparse Cholesky via cuDSS (optional) | @@ -606,7 +798,7 @@ cmake -G Ninja -B build -S . \ cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu \ gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen \ - gpu_device_matrix gpu_cufft gpu_cusparse_spmv + gpu_device_matrix gpu_cufft gpu_cusparse_spmv gpu_cg ctest --test-dir build -R "gpu_" --output-on-failure # Sparse solvers (cuDSS -- separate install required) @@ -627,10 +819,19 @@ ctest --test-dir build -R gpu_cudss --output-on-failure Device-side accessors returning `DeviceMatrix` views of the internal buffers would allow chaining GPU operations (e.g., `svd.deviceU() * d_A`) without round-tripping through host memory. -- **Device-resident sparse matrix-vector products.** `GpuSparseContext` - currently operates on host vectors and matrices, uploading and downloading - on each call. The key missing piece is a `DeviceSparseView` that holds a - sparse matrix on device and supports operator syntax (`d_y = d_A * d_x`) - with `DeviceMatrix` operands -- keeping the entire SpMV/SpMM pipeline on - device. This is essential for iterative solvers and any workflow that chains - sparse and dense operations without returning to the host. +- **Batched API (`DeviceBatchMatrix`).** A strided batch of N identical-size + matrices dispatching to cuBLAS/cuSOLVER batched APIs (`cublasDgemmBatched`, + `cusolverDnXpotrfBatched`, etc.). 
This enables robotics and model-predictive + control workloads where many small independent systems are solved in + parallel. +- **cuTENSOR for Tensor module.** Replace the hand-written GPU tensor + contraction and reduction kernels (~2300 lines in + `TensorContractionGpu.h` / `TensorReductionGpu.h`) with cuTENSOR dispatch, + following the same library-dispatch pattern used by `Eigen/GPU`. +- **Unified/zero-copy memory for Jetson.** Use `cudaMallocManaged` or + `cudaHostAllocMapped` to eliminate `fromHost()` / `toHost()` copies on + integrated GPUs (Jetson) where CPU and GPU share DRAM. +- **Device-side Eigen interop.** Bridge between host-side `DeviceMatrix` + dispatch and device-side Eigen expression templates (Core + Tensor) running + inside CUDA kernels. Raw-pointer + `Map` / `TensorMap` as the zero-copy + interop surface. diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 9130a3cb7..4b28580c0 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -31,7 +31,10 @@ EIGEN_DONT_INLINE void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Index& iters, typename Dest::RealScalar& tol_error) { typedef typename Dest::RealScalar RealScalar; typedef typename Dest::Scalar Scalar; - typedef Matrix VectorType; + // Use Dest's plain (owning) type as VectorType. For CPU Matrix/Map this + // resolves to Matrix. For GPU DeviceMatrix, PlainObject + // is DeviceMatrix itself (already owning). + typedef typename Dest::PlainObject VectorType; RealScalar tol = tol_error; Index maxIters = iters; diff --git a/benchmarks/GPU/CMakeLists.txt b/benchmarks/GPU/CMakeLists.txt index 53400912c..70bb6c275 100644 --- a/benchmarks/GPU/CMakeLists.txt +++ b/benchmarks/GPU/CMakeLists.txt @@ -11,7 +11,7 @@ # ncu --set full -o profile ./build-bench-gpu/bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096 cmake_minimum_required(VERSION 3.18) -project(EigenGpuBenchmarks CXX) +project(EigenGpuBenchmarks CXX CUDA) find_package(benchmark REQUIRED) find_package(CUDAToolkit REQUIRED) @@ -55,3 +55,37 @@ eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITI # FFT benchmarks: 1D/2D C2C, R2C, C2R throughput and plan reuse. eigen_add_gpu_benchmark(bench_gpu_fft bench_gpu_fft.cpp LIBRARIES CUDA::cufft) eigen_add_gpu_benchmark(bench_gpu_fft_double bench_gpu_fft.cpp LIBRARIES CUDA::cufft DEFINITIONS SCALAR=double) + +# CG sync overhead benchmark: host vs device pointer mode for reductions. +# Uses CUDA kernels for device scalar arithmetic. +add_executable(bench_gpu_cg_sync bench_gpu_cg_sync.cu) +target_include_directories(bench_gpu_cg_sync PRIVATE + ${EIGEN_SOURCE_DIR} + ${CUDAToolkit_INCLUDE_DIRS}) +target_link_libraries(bench_gpu_cg_sync PRIVATE + benchmark::benchmark benchmark::benchmark_main + CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc) +target_compile_options(bench_gpu_cg_sync PRIVATE $<$:-O3 --expt-relaxed-constexpr>) +target_compile_definitions(bench_gpu_cg_sync PRIVATE EIGEN_USE_GPU) + +# GPU CG vs CPU CG comparison benchmark. 
+add_executable(bench_gpu_cg_vs_cpu bench_gpu_cg_vs_cpu.cu) +target_include_directories(bench_gpu_cg_vs_cpu PRIVATE + ${EIGEN_SOURCE_DIR} + ${CUDAToolkit_INCLUDE_DIRS}) +target_link_libraries(bench_gpu_cg_vs_cpu PRIVATE + benchmark::benchmark benchmark::benchmark_main + CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc) +target_compile_options(bench_gpu_cg_vs_cpu PRIVATE $<$:-O3 --expt-relaxed-constexpr>) +target_compile_definitions(bench_gpu_cg_vs_cpu PRIVATE EIGEN_USE_GPU) + +# Bundle Adjustment benchmark: GPU CG vs CPU CG on real BAL datasets. +add_executable(bench_gpu_ba bench_gpu_ba.cu) +target_include_directories(bench_gpu_ba PRIVATE + ${EIGEN_SOURCE_DIR} + ${CUDAToolkit_INCLUDE_DIRS}) +target_link_libraries(bench_gpu_ba PRIVATE + benchmark::benchmark + CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc) +target_compile_options(bench_gpu_ba PRIVATE $<$:-O3 --expt-relaxed-constexpr>) +target_compile_definitions(bench_gpu_ba PRIVATE EIGEN_USE_GPU) diff --git a/benchmarks/GPU/ba_results.md b/benchmarks/GPU/ba_results.md new file mode 100644 index 000000000..b6b0c2392 --- /dev/null +++ b/benchmarks/GPU/ba_results.md @@ -0,0 +1,149 @@ +# Bundle Adjustment: GPU CG vs CPU CG Results + +Benchmark of Eigen's GPU CG pipeline on normal equations arising from bundle +adjustment (BAL datasets). Compares CPU `ConjugateGradient` (Jacobi preconditioner) +against GPU CG using `DeviceMatrix` + `GpuSparseContext` + `DeviceScalar`. + +## Hardware + +- **CPU**: Intel Core i7-13700HX (Raptor Lake, 12 cores / 24 threads, single thread for Eigen CG) +- **GPU**: NVIDIA GeForce RTX 4070 Laptop GPU (Ada Lovelace, 4608 CUDA cores, 8 GB GDDR6) +- **CUDA**: 13.2 / Driver 595.79 +- **OS**: Ubuntu 24.04 (WSL2, kernel 6.6.87) + +## Software + +- Eigen: `eigen-gpu-cg` branch +- Google Benchmark 1.9.1 +- Compiler: nvcc 13.2 + g++ 13.3 +- Normal equations: H = J^T*J + I (Levenberg-Marquardt damping lambda=1.0) +- CG tolerance: 1e-8, max iterations: 10000 + +## Method + +For each BAL problem file: +1. Parse the BAL file (cameras, 3D points, 2D observations) +2. Compute the full Jacobian J using the BAL camera model (Rodrigues rotation + + perspective projection + radial distortion) with central finite differences +3. Form the normal equations H = J^T*J + lambda*I (sparse, symmetric positive definite) +4. Solve H*dx = -J^T*r using CG with Jacobi preconditioner on CPU and GPU +5. Report wall-clock time (mean of 3 repetitions) + +GPU CG uses: `GpuSparseContext` for SpMV, `DeviceMatrix` for vectors, +`DeviceScalar` with `CUBLAS_POINTER_MODE_DEVICE` for dot/norm reductions, +in-place `cwiseProduct` via NPP for Jacobi preconditioner application, +device-pointer-mode `scal` to avoid host sync on the beta update. + +## Results + +### Summary table + +| Dataset | Cameras | Points | Obs | H size | H nnz | CG iters | CPU CG (ms) | GPU CG (ms) | Speedup | +|---------|---------|--------|-----|--------|-------|----------|-------------|-------------|---------| +| Ladybug-49 | 49 | 7,776 | 31,843 | 23,769 | 1.8M | 4,421 | 4,006 | 1,152 | **3.5x** | +| Ladybug-138 | 138 | 19,878 | 85,217 | 60,876 | 4.8M | 7,008 | 21,498 | 3,553 | **6.1x** | +| Ladybug-646 | 646 | 73,584 | 327,297 | 226,566 | 18.4M | 10,000* | 123,727 | 14,268 | **8.7x** | +| Dubrovnik-356 | 356 | 226,730 | 1,255,268 | 683,394 | 69.8M | 4,308 | 216,149 | 24,493 | **8.8x** | + +\* Hit 10,000 iteration cap (poorly conditioned problem). Both CPU and GPU +hit the same cap, so timing comparison remains valid. 
+ +### Profile breakdown (Ladybug-138, nsys) + +GPU kernel time is dominated by SpMV (91%). The remaining 9% is BLAS-1 +operations (dot, axpy, scal) and NPP element-wise ops (cwiseProduct). + +| Kernel | Time (ms) | % | Calls | +|--------|-----------|---|-------| +| cuSPARSE csrmv (SpMV) | 2507 | 91.3% | 7,006 | +| cuBLAS dot | 92 | 3.4% | 21,020 | +| cuBLAS axpy (device ptr) | 27 | 1.0% | 14,012 | +| cuSPARSE partition | 19 | 0.7% | 7,006 | +| NPP cwiseProduct | 16 + 13 | 1.1% | 14,011 + 7,006 | +| cuBLAS axpy (host ptr) | 12 | 0.5% | 7,005 | +| cuBLAS scal (device ptr) | 11 | 0.4% | 7,005 | +| NPP scalar ops | 7 | 0.2% | 7,006 | + +### Optimizations applied + +Three profiling-driven optimizations reduced GPU CG time by **1.8x** +(6.5s → 3.6s on Ladybug-138): + +1. **In-place `cwiseProduct`**: The Jacobi preconditioner apply + (`z = invdiag .* residual`) was allocating a new DeviceMatrix every + iteration. Added `z.cwiseProduct(ctx, a, b)` that reuses `z`'s buffer. + Reduced `cudaMalloc` calls from 7,053 to 23 (saving 2.3s). + +2. **`squaredNorm` via `dot(x,x)`**: cuBLAS `nrm2` uses a numerically + careful scaled-sum-of-squares algorithm (29µs/call). Replaced with + `dot(x,x)` (6.4µs/call) — 4.5x faster per call, saving ~320ms. + +3. **Device-pointer `scal`**: `p *= beta` was converting `DeviceScalar` + beta to host (triggering a stream sync), then calling host-pointer-mode + scal. Added `operator*=(DeviceScalar)` that uses device-pointer-mode + scal, eliminating one sync per iteration. Halved `cudaStreamSynchronize` + calls from 14K to 7K. + +### Observations + +1. **GPU speedup scales with problem size**: from 3.5x on small problems + (24K variables) to 8.8x on large problems (683K variables). This is + expected — larger problems have more parallelism for the GPU to exploit. + +2. **Iteration counts match**: CPU and GPU CG converge in the same number + of iterations (within 1%), confirming numerical equivalence. + +3. **Bottleneck is SpMV**: CG iteration time is dominated (91%) by the + sparse matrix-vector product on H. Further speedup requires either + faster SpMV (e.g., block-sparse formats) or algorithmic improvements + (Schur complement, better preconditioners). + +4. **Remaining overhead**: CUDA API calls (cudaMemcpyAsync for 8-byte + DeviceScalar transfers) account for ~50% of non-kernel time. Batching + multiple scalar reductions into a single transfer would help. + +5. **Jacobi preconditioner is weak for BA**: The Ladybug-646 problem does + not converge in 10K iterations. Ceres uses block Jacobi or Schur + complement preconditioners that would also benefit from GPU acceleration. 
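+For reference, the three optimizations appear in the GPU CG inner loop of
+`bench_gpu_ba.cu` as:
+
+```cpp
+z.cwiseProduct(ctx, d_invdiag, residual);   // (1) in-place preconditioner apply, no cudaMalloc
+residualNorm2 = residual.squaredNorm(ctx);  // (2) dot(x,x) under the hood, not nrm2
+p *= beta;                                  // (3) DeviceScalar beta, device-pointer scal, no host sync
+```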
+ +### Scaling plot data + +``` +# n nnz_H cpu_ms gpu_ms speedup +23769 1793475 4006 1152 3.48 +60876 4791762 21498 3553 6.05 +226566 18387948 123727 14268 8.67 +683394 69827066 216149 24493 8.82 +``` + +## BAL datasets + +Downloaded from http://grail.cs.washington.edu/projects/bal/ + +| File | Source | +|------|--------| +| problem-49-7776-pre.txt | Ladybug sequence | +| problem-138-19878-pre.txt | Ladybug sequence | +| problem-646-73584-pre.txt | Ladybug sequence | +| problem-356-226730-pre.txt | Dubrovnik reconstruction | + +## Reproducing + +```bash +# Build +cmake -G Ninja -B build-bench-gpu -S benchmarks/GPU -DCMAKE_CUDA_ARCHITECTURES=89 +cmake --build build-bench-gpu --target bench_gpu_ba + +# Download BAL datasets +wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-49-7776-pre.txt.bz2 +wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-138-19878-pre.txt.bz2 +wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-646-73584-pre.txt.bz2 +wget http://grail.cs.washington.edu/projects/bal/data/dubrovnik/problem-356-226730-pre.txt.bz2 +bunzip2 *.bz2 + +# Run (one at a time) +BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3 +BAL_FILE=problem-138-19878-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3 +BAL_FILE=problem-646-73584-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3 +BAL_FILE=problem-356-226730-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3 +``` diff --git a/benchmarks/GPU/bench_gpu_ba.cu b/benchmarks/GPU/bench_gpu_ba.cu new file mode 100644 index 000000000..7278bba99 --- /dev/null +++ b/benchmarks/GPU/bench_gpu_ba.cu @@ -0,0 +1,533 @@ +// Bundle Adjustment benchmark: GPU CG vs CPU CG on real BAL datasets. +// +// Tests Eigen's GPU CG pipeline (DeviceMatrix + GpuSparseContext + DeviceScalar) +// on the normal equations (J^T*J) arising from bundle adjustment problems. +// +// Reads a BAL (Bundle Adjustment in the Large) format file, computes the +// Jacobian and residual, forms the normal equations H = J^T*J + lambda*I, +// then solves H*dx = -J^T*r with both CPU and GPU conjugate gradients. +// +// BAL format: http://grail.cs.washington.edu/projects/bal/ +// +// Usage: +// cmake --build build-bench-gpu --target bench_gpu_ba +// +// # Download a BAL dataset (bz2-compressed): +// wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-49-7776-pre.txt.bz2 +// bunzip2 problem-49-7776-pre.txt.bz2 +// +// # Run on a specific problem: +// BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba +// +// # Append results to the log: +// BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba \ +// --benchmark_format=console 2>&1 | tee -a benchmarks/GPU/ba_results.log + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +using namespace Eigen; + +// ============================================================================ +// BAL problem data +// ============================================================================ + +struct BALProblem { + int num_cameras = 0; + int num_points = 0; + int num_observations = 0; + + // Observations: (camera_idx, point_idx, observed_x, observed_y). + std::vector camera_index; + std::vector point_index; + std::vector observations_x; + std::vector observations_y; + + // Camera parameters: 9 per camera (Rodrigues r[3], translation t[3], f, k1, k2). + std::vector cameras; // [num_cameras * 9] + + // 3D points: 3 per point. 
+ std::vector points; // [num_points * 3] + + const double* camera(int i) const { return &cameras[i * 9]; } + const double* point(int i) const { return &points[i * 3]; } + + bool load(const std::string& filename) { + std::ifstream in(filename); + if (!in) { + fprintf(stderr, "ERROR: Cannot open BAL file: %s\n", filename.c_str()); + return false; + } + + in >> num_cameras >> num_points >> num_observations; + if (!in || num_cameras <= 0 || num_points <= 0 || num_observations <= 0) { + fprintf(stderr, "ERROR: Invalid BAL header in %s\n", filename.c_str()); + return false; + } + + camera_index.resize(num_observations); + point_index.resize(num_observations); + observations_x.resize(num_observations); + observations_y.resize(num_observations); + for (int i = 0; i < num_observations; ++i) { + in >> camera_index[i] >> point_index[i] >> observations_x[i] >> observations_y[i]; + } + + cameras.resize(num_cameras * 9); + for (int i = 0; i < num_cameras * 9; ++i) { + in >> cameras[i]; + } + + points.resize(num_points * 3); + for (int i = 0; i < num_points * 3; ++i) { + in >> points[i]; + } + + if (!in) { + fprintf(stderr, "ERROR: Truncated BAL file: %s\n", filename.c_str()); + return false; + } + + fprintf(stderr, "Loaded BAL: %d cameras, %d points, %d observations\n", num_cameras, num_points, num_observations); + return true; + } +}; + +// ============================================================================ +// Camera projection model (BAL convention) +// ============================================================================ + +// Rodrigues rotation: rotate point X by axis-angle vector omega. +static void rodrigues_rotate(const double* omega, const double* X, double* result) { + double theta2 = omega[0] * omega[0] + omega[1] * omega[1] + omega[2] * omega[2]; + if (theta2 > 1e-30) { + double theta = std::sqrt(theta2); + double costh = std::cos(theta); + double sinth = std::sin(theta); + double k = (1.0 - costh) / theta2; + + // Cross product omega x X. + double wx = omega[1] * X[2] - omega[2] * X[1]; + double wy = omega[2] * X[0] - omega[0] * X[2]; + double wz = omega[0] * X[1] - omega[1] * X[0]; + + // Dot product omega . X. + double dot = omega[0] * X[0] + omega[1] * X[1] + omega[2] * X[2]; + + result[0] = X[0] * costh + wx * (sinth / theta) + omega[0] * dot * k; + result[1] = X[1] * costh + wy * (sinth / theta) + omega[1] * dot * k; + result[2] = X[2] * costh + wz * (sinth / theta) + omega[2] * dot * k; + } else { + // Small angle: R ≈ I + [omega]×. + result[0] = X[0] + omega[1] * X[2] - omega[2] * X[1]; + result[1] = X[1] + omega[2] * X[0] - omega[0] * X[2]; + result[2] = X[2] + omega[0] * X[1] - omega[1] * X[0]; + } +} + +// Project a 3D point through a camera, returning the 2D residual. +// camera: [r0,r1,r2, t0,t1,t2, f, k1, k2] +// point: [X, Y, Z] +// observed: [ox, oy] +// residual: [rx, ry] = projected - observed +static void project(const double* camera, const double* point, const double* observed, double* residual) { + // Rotate. + double P[3]; + rodrigues_rotate(camera, point, P); + + // Translate. + P[0] += camera[3]; + P[1] += camera[4]; + P[2] += camera[5]; + + // Normalize (BAL convention: negative z). + double xp = -P[0] / P[2]; + double yp = -P[1] / P[2]; + + // Radial distortion. + double r2 = xp * xp + yp * yp; + double distortion = 1.0 + camera[7] * r2 + camera[8] * r2 * r2; + + // Apply focal length. 
+ double predicted_x = camera[6] * distortion * xp; + double predicted_y = camera[6] * distortion * yp; + + residual[0] = predicted_x - observed[0]; + residual[1] = predicted_y - observed[1]; +} + +// ============================================================================ +// Jacobian computation (numerical differentiation) +// ============================================================================ + +// Compute the 2x9 Jacobian block w.r.t. camera params and 2x3 block w.r.t. +// point coords for a single observation, using central finite differences. +static void compute_jacobian_block(const double* camera, const double* point, const double* observed, + double* J_cam, // 2x9, row-major + double* J_point) // 2x3, row-major +{ + constexpr double eps = 1e-8; + + // Camera parameters (9). + double cam_pert[9]; + std::copy(camera, camera + 9, cam_pert); + for (int j = 0; j < 9; ++j) { + double orig = cam_pert[j]; + double rp[2], rm[2]; + + cam_pert[j] = orig + eps; + project(cam_pert, point, observed, rp); + cam_pert[j] = orig - eps; + project(cam_pert, point, observed, rm); + cam_pert[j] = orig; + + J_cam[0 * 9 + j] = (rp[0] - rm[0]) / (2.0 * eps); + J_cam[1 * 9 + j] = (rp[1] - rm[1]) / (2.0 * eps); + } + + // Point coordinates (3). + double pt_pert[3]; + std::copy(point, point + 3, pt_pert); + for (int j = 0; j < 3; ++j) { + double orig = pt_pert[j]; + double rp[2], rm[2]; + + pt_pert[j] = orig + eps; + project(camera, pt_pert, observed, rp); + pt_pert[j] = orig - eps; + project(camera, pt_pert, observed, rm); + pt_pert[j] = orig; + + J_point[0 * 3 + j] = (rp[0] - rm[0]) / (2.0 * eps); + J_point[1 * 3 + j] = (rp[1] - rm[1]) / (2.0 * eps); + } +} + +// ============================================================================ +// Build normal equations: H = J^T*J + lambda*I, g = -J^T*r +// ============================================================================ + +struct NormalEquations { + SparseMatrix H; + VectorXd g; + VectorXd residual; + double residual_norm; + int jacobian_rows; + int jacobian_cols; + long jacobian_nnz; +}; + +static NormalEquations build_normal_equations(const BALProblem& problem, double lambda = 1.0) { + const int num_cam_params = problem.num_cameras * 9; + const int num_pt_params = problem.num_points * 3; + const int num_params = num_cam_params + num_pt_params; + const int num_residuals = problem.num_observations * 2; + + fprintf(stderr, "Building Jacobian: %d x %d, %ld nonzeros\n", num_residuals, num_params, + (long)problem.num_observations * 24); + + // Build J as a triplet list. + using Triplet = Eigen::Triplet; + std::vector triplets; + triplets.reserve(problem.num_observations * 24); // 2 rows × 12 nonzeros = 24 entries per obs + + VectorXd residual(num_residuals); + + for (int obs = 0; obs < problem.num_observations; ++obs) { + int ci = problem.camera_index[obs]; + int pi = problem.point_index[obs]; + double observed[2] = {problem.observations_x[obs], problem.observations_y[obs]}; + + // Compute residual. + double r[2]; + project(problem.camera(ci), problem.point(pi), observed, r); + residual[obs * 2 + 0] = r[0]; + residual[obs * 2 + 1] = r[1]; + + // Compute Jacobian blocks. + double J_cam[18], J_pt[6]; // 2x9 and 2x3 + compute_jacobian_block(problem.camera(ci), problem.point(pi), observed, J_cam, J_pt); + + // Insert camera block: rows [2*obs, 2*obs+1], cols [9*ci, 9*ci+8]. 
+ for (int row = 0; row < 2; ++row) { + for (int col = 0; col < 9; ++col) { + double val = J_cam[row * 9 + col]; + if (val != 0.0) { + triplets.emplace_back(obs * 2 + row, ci * 9 + col, val); + } + } + } + + // Insert point block: rows [2*obs, 2*obs+1], cols [num_cam_params + 3*pi, ...]. + for (int row = 0; row < 2; ++row) { + for (int col = 0; col < 3; ++col) { + double val = J_pt[row * 3 + col]; + if (val != 0.0) { + triplets.emplace_back(obs * 2 + row, num_cam_params + pi * 3 + col, val); + } + } + } + } + + // Build sparse Jacobian. + SparseMatrix J(num_residuals, num_params); + J.setFromTriplets(triplets.begin(), triplets.end()); + + fprintf(stderr, "Jacobian: %dx%d, nnz=%ld\n", (int)J.rows(), (int)J.cols(), (long)J.nonZeros()); + + // Form normal equations: H = J^T*J + lambda*I. + SparseMatrix H = (J.transpose() * J).pruned(); + + // Add Levenberg-Marquardt damping. + for (int i = 0; i < num_params; ++i) { + H.coeffRef(i, i) += lambda; + } + H.makeCompressed(); + + // Gradient: g = -J^T * r. + VectorXd g = -(J.transpose() * residual); + + double rnorm = residual.norm(); + fprintf(stderr, "Normal equations: H is %dx%d, nnz=%ld, |r|=%.6e\n", (int)H.rows(), (int)H.cols(), (long)H.nonZeros(), + rnorm); + + return {std::move(H), std::move(g), std::move(residual), rnorm, num_residuals, num_params, (long)J.nonZeros()}; +} + +// ============================================================================ +// Global problem state (loaded once before benchmarks run) +// ============================================================================ + +static BALProblem g_problem; +static NormalEquations g_neq; +static bool g_loaded = false; + +static void ensure_loaded() { + if (g_loaded) return; + + const char* bal_file = std::getenv("BAL_FILE"); + if (!bal_file) { + fprintf(stderr, + "ERROR: Set BAL_FILE environment variable to a BAL problem file.\n" + " Download from: http://grail.cs.washington.edu/projects/bal/\n" + " Example:\n" + " wget http://grail.cs.washington.edu/projects/bal/data/ladybug/" + "problem-49-7776-pre.txt.bz2\n" + " bunzip2 problem-49-7776-pre.txt.bz2\n" + " BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba\n"); + std::exit(1); + } + + if (!g_problem.load(bal_file)) { + std::exit(1); + } + + g_neq = build_normal_equations(g_problem); + g_loaded = true; +} + +// ============================================================================ +// CPU CG benchmark +// ============================================================================ + +static void BM_BA_CPU_CG(benchmark::State& state) { + ensure_loaded(); + const auto& H = g_neq.H; + const auto& g = g_neq.g; + + ConjugateGradient, Lower | Upper> cg; + cg.setMaxIterations(10000); + cg.setTolerance(1e-8); + cg.compute(H); + + int last_iters = 0; + double last_error = 0; + for (auto _ : state) { + VectorXd dx = cg.solve(g); + benchmark::DoNotOptimize(dx.data()); + last_iters = cg.iterations(); + last_error = cg.error(); + } + + state.counters["n"] = H.rows(); + state.counters["nnz"] = H.nonZeros(); + state.counters["iters"] = last_iters; + state.counters["error"] = last_error; + state.counters["cameras"] = g_problem.num_cameras; + state.counters["points"] = g_problem.num_points; + state.counters["observations"] = g_problem.num_observations; +} + +// ============================================================================ +// GPU CG benchmark (with Jacobi preconditioner) +// ============================================================================ + +static void cuda_warmup() { + static bool done = 
false; + if (!done) { + void* p; + cudaMalloc(&p, 1); + cudaFree(p); + done = true; + } +} + +static void BM_BA_GPU_CG(benchmark::State& state) { + ensure_loaded(); + cuda_warmup(); + + const auto& H = g_neq.H; + const auto& g = g_neq.g; + const Index n = H.rows(); + + // Extract inverse diagonal (Jacobi preconditioner). + using SpMat = SparseMatrix; + VectorXd invdiag(n); + for (Index j = 0; j < H.outerSize(); ++j) { + SpMat::InnerIterator it(H, j); + while (it && it.index() != j) ++it; + if (it && it.index() == j && it.value() != 0.0) + invdiag(j) = 1.0 / it.value(); + else + invdiag(j) = 1.0; + } + + // Set up GPU context and upload data. + GpuContext ctx; + GpuContext::setThreadLocal(&ctx); + GpuSparseContext spmv_ctx(ctx); + auto mat = spmv_ctx.deviceView(H); + auto d_invdiag = DeviceMatrix::fromHost(invdiag, ctx.stream()); + auto d_g = DeviceMatrix::fromHost(g, ctx.stream()); + + int last_iters = 0; + double last_error = 0; + + for (auto _ : state) { + DeviceMatrix d_x(n, 1); + d_x.setZero(ctx); + DeviceMatrix residual(n, 1); + residual.copyFrom(ctx, d_g); + + double rhsNorm2 = d_g.squaredNorm(ctx); + double threshold = 1e-8 * 1e-8 * rhsNorm2; + double residualNorm2 = residual.squaredNorm(ctx); + + DeviceMatrix p = d_invdiag.cwiseProduct(ctx, residual); + DeviceMatrix z(n, 1), tmp(n, 1); + + auto absNew = residual.dot(ctx, p); + Index i = 0; + Index maxIters = 10000; + while (i < maxIters) { + tmp.noalias() = mat * p; + auto alpha = absNew / p.dot(ctx, tmp); + d_x += alpha * p; + residual -= alpha * tmp; + + residualNorm2 = residual.squaredNorm(ctx); + if (residualNorm2 < threshold) break; + + z.cwiseProduct(ctx, d_invdiag, residual); // in-place, no allocation + auto absOld = std::move(absNew); + absNew = residual.dot(ctx, z); + auto beta = absNew / absOld; + + p *= beta; // device-pointer scal, no host sync + p += z; + i++; + } + benchmark::DoNotOptimize(d_x.data()); + last_iters = i; + last_error = std::sqrt(residualNorm2 / rhsNorm2); + } + + GpuContext::setThreadLocal(nullptr); + + state.counters["n"] = n; + state.counters["nnz"] = H.nonZeros(); + state.counters["iters"] = last_iters; + state.counters["error"] = last_error; + state.counters["cameras"] = g_problem.num_cameras; + state.counters["points"] = g_problem.num_points; + state.counters["observations"] = g_problem.num_observations; +} + +// ============================================================================ +// CPU CG with Jacobi preconditioner (apples-to-apples comparison) +// ============================================================================ + +static void BM_BA_CPU_CG_Jacobi(benchmark::State& state) { + ensure_loaded(); + const auto& H = g_neq.H; + const auto& g = g_neq.g; + + // Eigen's DiagonalPreconditioner is effectively Jacobi. 
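+  // DiagonalPreconditioner is also ConjugateGradient's default third template
+  // argument (the class is declared roughly as ConjugateGradient<MatrixType,
+  // UpLo, DiagonalPreconditioner<Scalar>>), so no preconditioner needs to be
+  // named explicitly to get Jacobi preconditioning here.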
+  ConjugateGradient<SparseMatrix<double>, Lower | Upper> cg;
+  cg.setMaxIterations(10000);
+  cg.setTolerance(1e-8);
+  cg.compute(H);
+
+  int last_iters = 0;
+  double last_error = 0;
+  for (auto _ : state) {
+    VectorXd dx = cg.solve(g);
+    benchmark::DoNotOptimize(dx.data());
+    last_iters = cg.iterations();
+    last_error = cg.error();
+  }
+
+  state.counters["n"] = H.rows();
+  state.counters["nnz"] = H.nonZeros();
+  state.counters["iters"] = last_iters;
+  state.counters["error"] = last_error;
+}
+
+// ============================================================================
+// Register benchmarks
+// ============================================================================
+
+BENCHMARK(BM_BA_CPU_CG)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_BA_CPU_CG_Jacobi)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_BA_GPU_CG)->Unit(benchmark::kMillisecond);
+
+// ============================================================================
+// Custom main: print summary after benchmarks
+// ============================================================================
+
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+
+  // Print problem info before benchmarks.
+  const char* bal_file = std::getenv("BAL_FILE");
+  if (bal_file) {
+    ensure_loaded();
+    fprintf(stderr,
+            "\n"
+            "=== Bundle Adjustment GPU CG Benchmark ===\n"
+            "BAL file: %s\n"
+            "Cameras: %d\n"
+            "Points: %d\n"
+            "Observations: %d\n"
+            "J size: %d x %d, nnz=%ld\n"
+            "H size: %d x %d, nnz=%ld\n"
+            "|residual|: %.6e\n"
+            "==========================================\n\n",
+            bal_file, g_problem.num_cameras, g_problem.num_points, g_problem.num_observations, g_neq.jacobian_rows,
+            g_neq.jacobian_cols, g_neq.jacobian_nnz, (int)g_neq.H.rows(), (int)g_neq.H.cols(),
+            (long)g_neq.H.nonZeros(), g_neq.residual_norm);
+  }
+
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  return 0;
+}
diff --git a/benchmarks/GPU/bench_gpu_cg_sync.cu b/benchmarks/GPU/bench_gpu_cg_sync.cu
new file mode 100644
index 000000000..1ec33e188
--- /dev/null
+++ b/benchmarks/GPU/bench_gpu_cg_sync.cu
@@ -0,0 +1,291 @@
+// Benchmark: GPU Conjugate Gradient via DeviceMatrix operators.
+//
+// Shows the path to running Eigen's CG on GPU with minimal code changes.
+// The DeviceMatrix benchmark mirrors Eigen's conjugate_gradient() line-by-line.
+// A raw cuBLAS device-pointer-mode implementation is included as a lower bound.
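+//
+// "Device pointer mode" means cublasSetPointerMode(handle,
+// CUBLAS_POINTER_MODE_DEVICE): alpha/beta arguments and dot/nrm2 results are
+// read from and written to device memory, so the scalar recurrences of CG can
+// stay on the stream instead of bouncing through the host on every call.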
+// +// The only change needed in Eigen's CG template to support DeviceMatrix: +// Line 34: typedef Dest VectorType; (instead of Matrix) +// +// Usage: +// cmake --build build-bench-gpu --target bench_gpu_cg_sync +// ./build-bench-gpu/bench_gpu_cg_sync + +#include + +#include +#include +#include + +using namespace Eigen; + +using Scalar = double; +using RealScalar = double; +using Vec = Matrix; +using SpMat = SparseMatrix; + +static SpMat make_spd(Index n) { + SpMat A(n, n); + A.reserve(VectorXi::Constant(n, 3)); + for (Index i = 0; i < n; ++i) { + A.insert(i, i) = 4.0; + if (i > 0) A.insert(i, i - 1) = -1.0; + if (i < n - 1) A.insert(i, i + 1) = -1.0; + } + A.makeCompressed(); + return A; +} + +static void cuda_warmup() { + static bool done = false; + if (!done) { + void* p; + cudaMalloc(&p, 1); + cudaFree(p); + done = true; + } +} + +// ========================================================================== +// GPU CG using DeviceMatrix operators — mirrors Eigen's conjugate_gradient() +// ========================================================================== +// +// Compare with Eigen/src/IterativeLinearSolvers/ConjugateGradient.h lines 29-84. +// Left column: Eigen CG code. Right column: this benchmark. +// +// Eigen CG GPU CG (this benchmark) +// -------- ----------------------- +// VectorType residual = rhs - mat * x; residual.copyFrom(ctx, rhs); [x=0 so r=b] +// RealScalar rhsNorm2 = rhs.sqNorm(); RealScalar rhsNorm2 = rhs.squaredNorm(); +// ... +// tmp.noalias() = mat * p; tmp.noalias() = mat * p; [identical] +// Scalar alpha = absNew / p.dot(tmp); Scalar alpha = absNew / p.dot(tmp); [identical] +// x += alpha * p; x += alpha * p; [identical] +// residual -= alpha * tmp; residual -= alpha * tmp; [identical] +// residualNorm2 = residual.sqNorm(); residualNorm2 = residual.squaredNorm(); [identical] +// ... +// p = z + beta * p; p *= beta; p += z; [equivalent, no alloc] + +static void BM_CG_DeviceMatrixOps(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + + SpMat A = make_spd(n); + Vec b = Vec::Random(n); + + // One shared context: SpMV + BLAS-1 on same stream, zero event overhead. + GpuContext ctx; + GpuContext::setThreadLocal(&ctx); + GpuSparseContext spmv(ctx); + auto mat = spmv.deviceView(A); + + // Upload RHS once. + auto rhs = DeviceMatrix::fromHost(b, ctx.stream()); + + for (auto _ : state) { + // --- Eigen CG lines 34-63: initialization --- + // typedef Dest VectorType; // GPU CHANGE: was Matrix + // VectorType residual = rhs - mat * x; // x=0, so residual = rhs + DeviceMatrix x(n, 1); + x.setZero(); + DeviceMatrix residual(n, 1); + residual.copyFrom(ctx, rhs); + + // RealScalar rhsNorm2 = rhs.squaredNorm(); + RealScalar rhsNorm2 = rhs.squaredNorm(); + if (rhsNorm2 == 0) continue; + + RealScalar tol = 1e-10; + const RealScalar considerAsZero = (std::numeric_limits::min)(); + RealScalar threshold = numext::maxi(RealScalar(tol * tol * rhsNorm2), considerAsZero); + + // RealScalar residualNorm2 = residual.squaredNorm(); + RealScalar residualNorm2 = residual.squaredNorm(); + if (residualNorm2 < threshold) continue; + + // VectorType p(n); + // p = precond.solve(residual); // no preconditioner: p = residual + DeviceMatrix p(n, 1); + p.copyFrom(ctx, residual); + + // VectorType z(n), tmp(n); + DeviceMatrix z(n, 1), tmp(n, 1); + + // auto absNew = numext::real(residual.dot(p)); + // DeviceScalar — stays on device, no sync. 
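+    // (Presumably a DeviceScalar is a small handle to a device-resident value
+    //  tied to the context's stream; it only synchronizes when converted to a
+    //  host Scalar, which this loop does not do until squaredNorm() below.)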
+ auto absNew = residual.dot(p); // DeviceScalar, no sync + + // while (i < maxIters) { + Index maxIters = 200; + Index i = 0; + while (i < maxIters) { + // tmp.noalias() = mat * p; + tmp.noalias() = mat * p; // SpMV, device-resident + + // auto alpha = absNew / p.dot(tmp); + // DeviceScalar / DeviceScalar → device kernel, no sync! + auto alpha = absNew / p.dot(tmp); // DeviceScalar, no sync + + // x += alpha * p; + // DeviceScalar * DeviceMatrix → device-pointer axpy, no sync! + x += alpha * p; + + // residual -= alpha * tmp; + residual -= alpha * tmp; // device-pointer axpy, no sync + + // residualNorm2 = residual.squaredNorm(); + residualNorm2 = residual.squaredNorm(); // THE one sync per iteration + + // if (residualNorm2 < threshold) break; + if (residualNorm2 < threshold) break; + + // z = precond.solve(residual); + z.copyFrom(ctx, residual); // no preconditioner + + // auto absOld = std::move(absNew); + auto absOld = std::move(absNew); // no sync, no alloc + + // absNew = numext::real(residual.dot(z)); + absNew = residual.dot(z); // DeviceScalar, no sync + + // auto beta = absNew / absOld; + // DeviceScalar / DeviceScalar → device kernel, no sync! + auto beta = absNew / absOld; // DeviceScalar, no sync + + // p = z + beta * p; + p *= beta; // device-pointer scal, no host sync + p += z; + + i++; + } + } + + GpuContext::setThreadLocal(nullptr); + state.SetItemsProcessed(state.iterations() * 200); +} + +BENCHMARK(BM_CG_DeviceMatrixOps)->RangeMultiplier(4)->Range(1 << 10, 1 << 20); + +// ========================================================================== +// Raw cuBLAS device-pointer-mode CG (1 sync/iter) — performance lower bound +// ========================================================================== + +__global__ void scalar_div_kernel(const Scalar* a, const Scalar* b, Scalar* out) { *out = *a / *b; } +__global__ void scalar_neg_kernel(const Scalar* in, Scalar* out) { *out = -(*in); } + +static void BM_CG_DevicePointerMode(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const int maxIters = 200; + + SpMat A = make_spd(n); + Vec b = Vec::Random(n); + + cudaStream_t stream; + cudaStreamCreate(&stream); + cublasHandle_t cublas; + cublasCreate(&cublas); + cublasSetStream(cublas, stream); + + cusparseHandle_t cusparse; + cusparseCreate(&cusparse); + cusparseSetStream(cusparse, stream); + + internal::DeviceBuffer d_outer((n + 1) * sizeof(int)); + internal::DeviceBuffer d_inner(A.nonZeros() * sizeof(int)); + internal::DeviceBuffer d_vals(A.nonZeros() * sizeof(Scalar)); + cudaMemcpy(d_outer.ptr, A.outerIndexPtr(), (n + 1) * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_inner.ptr, A.innerIndexPtr(), A.nonZeros() * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_vals.ptr, A.valuePtr(), A.nonZeros() * sizeof(Scalar), cudaMemcpyHostToDevice); + + cusparseSpMatDescr_t matA; + cusparseCreateCsc(&matA, n, n, A.nonZeros(), d_outer.ptr, d_inner.ptr, d_vals.ptr, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F); + + internal::DeviceBuffer d_tmp_buf(n * sizeof(Scalar)); + cusparseDnVecDescr_t tmp_x, tmp_y; + cusparseCreateDnVec(&tmp_x, n, d_tmp_buf.ptr, CUDA_R_64F); + cusparseCreateDnVec(&tmp_y, n, d_tmp_buf.ptr, CUDA_R_64F); + Scalar spmv_alpha = 1.0, spmv_beta = 0.0; + size_t ws_size = 0; + cusparseSpMV_bufferSize(cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &spmv_alpha, matA, tmp_x, &spmv_beta, tmp_y, + CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &ws_size); + internal::DeviceBuffer d_workspace(ws_size); + 
cusparseDestroyDnVec(tmp_x); + cusparseDestroyDnVec(tmp_y); + + internal::DeviceBuffer d_x(n * sizeof(Scalar)), d_r(n * sizeof(Scalar)); + internal::DeviceBuffer d_p(n * sizeof(Scalar)), d_tmp(n * sizeof(Scalar)); + internal::DeviceBuffer d_b(n * sizeof(Scalar)); + internal::DeviceBuffer d_absNew(sizeof(Scalar)), d_absOld(sizeof(Scalar)); + internal::DeviceBuffer d_pdot(sizeof(Scalar)), d_alpha(sizeof(Scalar)); + internal::DeviceBuffer d_neg_alpha(sizeof(Scalar)), d_beta(sizeof(Scalar)); + internal::DeviceBuffer d_rnorm(sizeof(RealScalar)); + + cudaMemcpy(d_b.ptr, b.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice); + + auto spmv = [&](Scalar* x_ptr, Scalar* y_ptr) { + cusparseDnVecDescr_t vx, vy; + cusparseCreateDnVec(&vx, n, x_ptr, CUDA_R_64F); + cusparseCreateDnVec(&vy, n, y_ptr, CUDA_R_64F); + cusparseSpMV(cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &spmv_alpha, matA, vx, &spmv_beta, vy, CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, d_workspace.ptr); + cusparseDestroyDnVec(vx); + cusparseDestroyDnVec(vy); + }; + + for (auto _ : state) { + cudaMemsetAsync(static_cast(d_x.ptr), 0, n * sizeof(Scalar), stream); + cudaMemcpyAsync(d_r.ptr, d_b.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToDevice, stream); + cudaMemcpyAsync(d_p.ptr, d_b.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToDevice, stream); + + cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_DEVICE); + cublasDdot(cublas, n, static_cast(d_r.ptr), 1, static_cast(d_p.ptr), 1, + static_cast(d_absNew.ptr)); + + for (int i = 0; i < maxIters; ++i) { + spmv(static_cast(d_p.ptr), static_cast(d_tmp.ptr)); + + cublasDdot(cublas, n, static_cast(d_p.ptr), 1, static_cast(d_tmp.ptr), 1, + static_cast(d_pdot.ptr)); + + scalar_div_kernel<<<1, 1, 0, stream>>>(static_cast(d_absNew.ptr), static_cast(d_pdot.ptr), + static_cast(d_alpha.ptr)); + scalar_neg_kernel<<<1, 1, 0, stream>>>(static_cast(d_alpha.ptr), static_cast(d_neg_alpha.ptr)); + + cublasDaxpy(cublas, n, static_cast(d_alpha.ptr), static_cast(d_p.ptr), 1, + static_cast(d_x.ptr), 1); + cublasDaxpy(cublas, n, static_cast(d_neg_alpha.ptr), static_cast(d_tmp.ptr), 1, + static_cast(d_r.ptr), 1); + + cublasDnrm2(cublas, n, static_cast(d_r.ptr), 1, static_cast(d_rnorm.ptr)); + + RealScalar rnorm; + cudaMemcpyAsync(&rnorm, d_rnorm.ptr, sizeof(RealScalar), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + if (rnorm * rnorm < 1e-20) break; + + cudaMemcpyAsync(d_absOld.ptr, d_absNew.ptr, sizeof(Scalar), cudaMemcpyDeviceToDevice, stream); + cublasDdot(cublas, n, static_cast(d_r.ptr), 1, static_cast(d_r.ptr), 1, + static_cast(d_absNew.ptr)); + + scalar_div_kernel<<<1, 1, 0, stream>>>(static_cast(d_absNew.ptr), static_cast(d_absOld.ptr), + static_cast(d_beta.ptr)); + + cublasDscal(cublas, n, static_cast(d_beta.ptr), static_cast(d_p.ptr), 1); + cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_HOST); + Scalar one = 1.0; + cublasDaxpy(cublas, n, &one, static_cast(d_r.ptr), 1, static_cast(d_p.ptr), 1); + cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_DEVICE); + } + cudaStreamSynchronize(stream); + } + + state.SetItemsProcessed(state.iterations() * maxIters); + cusparseDestroySpMat(matA); + cusparseDestroy(cusparse); + cublasDestroy(cublas); + cudaStreamDestroy(stream); +} + +BENCHMARK(BM_CG_DevicePointerMode)->RangeMultiplier(4)->Range(1 << 10, 1 << 20); diff --git a/benchmarks/GPU/bench_gpu_cg_vs_cpu.cu b/benchmarks/GPU/bench_gpu_cg_vs_cpu.cu new file mode 100644 index 000000000..b1d9a4b81 --- /dev/null +++ b/benchmarks/GPU/bench_gpu_cg_vs_cpu.cu @@ -0,0 +1,216 @@ +// Benchmark: GPU CG vs CPU CG on 
realistic sparse systems. +// +// Tests 2D Laplacian (5-point stencil) and 3D Laplacian (7-point stencil) +// in both float and double precision. +// +// Usage: +// cmake --build build-bench-gpu --target bench_gpu_cg_vs_cpu +// ./build-bench-gpu/bench_gpu_cg_vs_cpu + +#include + +#include +#include +#include + +using namespace Eigen; + +// ---- Sparse matrix generators ----------------------------------------------- + +template +SparseMatrix make_laplacian_2d(int grid_n) { + using SpMat = SparseMatrix; + const int n = grid_n * grid_n; + SpMat A(n, n); + A.reserve(VectorXi::Constant(n, 5)); + for (int i = 0; i < grid_n; ++i) { + for (int j = 0; j < grid_n; ++j) { + int idx = i * grid_n + j; + A.insert(idx, idx) = Scalar(4); + if (i > 0) A.insert(idx, idx - grid_n) = Scalar(-1); + if (i < grid_n - 1) A.insert(idx, idx + grid_n) = Scalar(-1); + if (j > 0) A.insert(idx, idx - 1) = Scalar(-1); + if (j < grid_n - 1) A.insert(idx, idx + 1) = Scalar(-1); + } + } + A.makeCompressed(); + return A; +} + +template +SparseMatrix make_laplacian_3d(int grid_n) { + using SpMat = SparseMatrix; + const int n = grid_n * grid_n * grid_n; + const int n2 = grid_n * grid_n; + SpMat A(n, n); + A.reserve(VectorXi::Constant(n, 7)); + for (int i = 0; i < grid_n; ++i) { + for (int j = 0; j < grid_n; ++j) { + for (int k = 0; k < grid_n; ++k) { + int idx = i * n2 + j * grid_n + k; + A.insert(idx, idx) = Scalar(6); + if (i > 0) A.insert(idx, idx - n2) = Scalar(-1); + if (i < grid_n - 1) A.insert(idx, idx + n2) = Scalar(-1); + if (j > 0) A.insert(idx, idx - grid_n) = Scalar(-1); + if (j < grid_n - 1) A.insert(idx, idx + grid_n) = Scalar(-1); + if (k > 0) A.insert(idx, idx - 1) = Scalar(-1); + if (k < grid_n - 1) A.insert(idx, idx + 1) = Scalar(-1); + } + } + } + A.makeCompressed(); + return A; +} + +static void cuda_warmup() { + static bool done = false; + if (!done) { + void* p; + cudaMalloc(&p, 1); + cudaFree(p); + done = true; + } +} + +// ---- CPU CG ----------------------------------------------------------------- + +template +void run_cpu_cg(benchmark::State& state, MatGen make_matrix) { + using SpMat = SparseMatrix; + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + const int grid_n = state.range(0); + SpMat A = make_matrix(grid_n); + Vec b = Vec::Random(A.rows()); + + ConjugateGradient cg; + cg.setMaxIterations(10000); + cg.setTolerance(RealScalar(1e-8)); + cg.compute(A); + + int last_iters = 0; + for (auto _ : state) { + Vec x = cg.solve(b); + benchmark::DoNotOptimize(x.data()); + last_iters = cg.iterations(); + } + state.counters["n"] = A.rows(); + state.counters["nnz"] = A.nonZeros(); + state.counters["iters"] = last_iters; + state.counters["error"] = cg.error(); +} + +// ---- GPU CG ----------------------------------------------------------------- + +template +void run_gpu_cg(benchmark::State& state, MatGen make_matrix) { + using SpMat = SparseMatrix; + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + cuda_warmup(); + const int grid_n = state.range(0); + SpMat A = make_matrix(grid_n); + const Index n = A.rows(); + Vec b = Vec::Random(n); + + // Extract inverse diagonal. 
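+  // Walk each column's InnerIterator to the diagonal entry and invert it,
+  // falling back to 1 when the diagonal is absent or zero. This mirrors what
+  // Eigen's DiagonalPreconditioner does on the CPU; a hypothetical equivalent
+  // (not used here) would be:
+  //   DiagonalPreconditioner<Scalar> jacobi(A);
+  //   Vec invdiag_ref = jacobi.solve(Vec::Ones(n));  // same values as invdiag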
+ Vec invdiag(n); + for (Index j = 0; j < A.outerSize(); ++j) { + typename SpMat::InnerIterator it(A, j); + while (it && it.index() != j) ++it; + if (it && it.index() == j && it.value() != Scalar(0)) + invdiag(j) = Scalar(1) / it.value(); + else + invdiag(j) = Scalar(1); + } + + GpuContext ctx; + GpuContext::setThreadLocal(&ctx); + GpuSparseContext spmv_ctx(ctx); + auto mat = spmv_ctx.deviceView(A); + auto d_invdiag = DeviceMatrix::fromHost(invdiag, ctx.stream()); + auto d_b = DeviceMatrix::fromHost(b, ctx.stream()); + + int last_iters = 0; + RealScalar last_error = 0; + + for (auto _ : state) { + DeviceMatrix d_x(n, 1); + d_x.setZero(ctx); + DeviceMatrix residual(n, 1); + residual.copyFrom(ctx, d_b); + + RealScalar rhsNorm2 = d_b.squaredNorm(ctx); + RealScalar tol = RealScalar(1e-8); + RealScalar threshold = tol * tol * rhsNorm2; + RealScalar residualNorm2 = residual.squaredNorm(ctx); + + DeviceMatrix p = d_invdiag.cwiseProduct(ctx, residual); + DeviceMatrix z(n, 1), tmp(n, 1); + + auto absNew = residual.dot(ctx, p); + Index i = 0; + Index maxIters = 10000; + while (i < maxIters) { + tmp.noalias() = mat * p; + auto alpha = absNew / p.dot(ctx, tmp); + d_x += alpha * p; + residual -= alpha * tmp; + + residualNorm2 = residual.squaredNorm(ctx); + if (residualNorm2 < threshold) break; + + z.cwiseProduct(ctx, d_invdiag, residual); + auto absOld = std::move(absNew); + absNew = residual.dot(ctx, z); + auto beta = absNew / absOld; + + p *= beta; + p += z; + i++; + } + benchmark::DoNotOptimize(d_x.data()); + last_iters = i; + last_error = numext::sqrt(residualNorm2 / rhsNorm2); + } + + GpuContext::setThreadLocal(nullptr); + state.counters["n"] = n; + state.counters["nnz"] = A.nonZeros(); + state.counters["iters"] = last_iters; + state.counters["error"] = last_error; +} + +// ---- 2D Laplacian, double --------------------------------------------------- + +static void BM_CG_CPU_2D_double(benchmark::State& state) { run_cpu_cg(state, make_laplacian_2d); } +static void BM_CG_GPU_2D_double(benchmark::State& state) { run_gpu_cg(state, make_laplacian_2d); } + +BENCHMARK(BM_CG_CPU_2D_double)->ArgsProduct({{32, 64, 128, 256, 512}}); +BENCHMARK(BM_CG_GPU_2D_double)->ArgsProduct({{32, 64, 128, 256, 512}}); + +// ---- 2D Laplacian, float ---------------------------------------------------- + +static void BM_CG_CPU_2D_float(benchmark::State& state) { run_cpu_cg(state, make_laplacian_2d); } +static void BM_CG_GPU_2D_float(benchmark::State& state) { run_gpu_cg(state, make_laplacian_2d); } + +BENCHMARK(BM_CG_CPU_2D_float)->ArgsProduct({{32, 64, 128, 256, 512}}); +BENCHMARK(BM_CG_GPU_2D_float)->ArgsProduct({{32, 64, 128, 256, 512}}); + +// ---- 3D Laplacian, double --------------------------------------------------- + +static void BM_CG_CPU_3D_double(benchmark::State& state) { run_cpu_cg(state, make_laplacian_3d); } +static void BM_CG_GPU_3D_double(benchmark::State& state) { run_gpu_cg(state, make_laplacian_3d); } + +BENCHMARK(BM_CG_CPU_3D_double)->ArgsProduct({{16, 32, 48, 64}}); +BENCHMARK(BM_CG_GPU_3D_double)->ArgsProduct({{16, 32, 48, 64}}); + +// ---- 3D Laplacian, float ---------------------------------------------------- + +static void BM_CG_CPU_3D_float(benchmark::State& state) { run_cpu_cg(state, make_laplacian_3d); } +static void BM_CG_GPU_3D_float(benchmark::State& state) { run_gpu_cg(state, make_laplacian_3d); } + +BENCHMARK(BM_CG_CPU_3D_float)->ArgsProduct({{16, 32, 48, 64}}); +BENCHMARK(BM_CG_GPU_3D_float)->ArgsProduct({{16, 32, 48, 64}}); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 
33615c2aa..9b2c3d53d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -481,13 +481,13 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) ei_add_test(gpu_basic) ei_add_test(gpu_library_example "" "CUDA::cusolver") - # DeviceMatrix tests: only CUDA runtime, no NVIDIA libraries. + # DeviceMatrix tests: CUDA runtime + cuBLAS + cuSOLVER (for BLAS-1 ops via GpuContext). unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) add_executable(gpu_device_matrix gpu_device_matrix.cpp) target_include_directories(gpu_device_matrix PRIVATE "${CUDA_TOOLKIT_ROOT_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}") - target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart) + target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart CUDA::cublas CUDA::cusolver CUDA::npps CUDA::nppc) target_compile_definitions(gpu_device_matrix PRIVATE EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE} EIGEN_TEST_PART_ALL=1) @@ -575,7 +575,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) "${CUDA_TOOLKIT_ROOT_DIR}/include" "${CMAKE_CURRENT_BINARY_DIR}") target_link_libraries(gpu_cusparse_spmv - Eigen3::Eigen CUDA::cudart CUDA::cusparse) + Eigen3::Eigen CUDA::cudart CUDA::cusparse CUDA::cublas CUDA::cusolver) target_compile_definitions(gpu_cusparse_spmv PRIVATE EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE} EIGEN_TEST_PART_ALL=1) @@ -584,6 +584,23 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) add_dependencies(buildtests_gpu gpu_cusparse_spmv) set_property(TEST gpu_cusparse_spmv APPEND PROPERTY LABELS "Official;gpu") set_property(TEST gpu_cusparse_spmv PROPERTY SKIP_RETURN_CODE 77) + + # End-to-end GPU CG test: Eigen's ConjugateGradient with DeviceMatrix. + add_executable(gpu_cg gpu_cg.cpp) + target_include_directories(gpu_cg PRIVATE + "${CUDA_TOOLKIT_ROOT_DIR}/include" + "${CMAKE_CURRENT_BINARY_DIR}") + target_link_libraries(gpu_cg + Eigen3::Eigen CUDA::cudart CUDA::cusparse CUDA::cublas CUDA::cusolver CUDA::npps CUDA::nppc) + target_compile_definitions(gpu_cg PRIVATE + EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE} + EIGEN_TEST_PART_ALL=1) + add_test(NAME gpu_cg COMMAND gpu_cg) + add_dependencies(buildtests gpu_cg) + add_dependencies(buildtests_gpu gpu_cg) + set_property(TEST gpu_cg APPEND PROPERTY LABELS "Official;gpu") + set_property(TEST gpu_cg PROPERTY SKIP_RETURN_CODE 77) + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") endif() diff --git a/test/gpu_cg.cpp b/test/gpu_cg.cpp new file mode 100644 index 000000000..af62e3ae0 --- /dev/null +++ b/test/gpu_cg.cpp @@ -0,0 +1,224 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// End-to-end test: CG algorithm running on GPU via DeviceMatrix. +// +// Uses DeviceSparseView for SpMV, DeviceMatrix for vectors, DeviceScalar +// for deferred reductions. Verifies correctness against CPU ConjugateGradient. 
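+//
+// The CMake entry above registers SKIP_RETURN_CODE 77 for this test, so on a
+// machine without a usable CUDA device the harness can exit with 77 and CTest
+// reports the test as skipped rather than failed.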
+ +#define EIGEN_USE_GPU +#include "main.h" +#include +#include +#include + +using namespace Eigen; + +// ---- Helper: build a sparse SPD matrix -------------------------------------- + +template +SparseMatrix make_spd(Index n, double density = 0.1) { + using SpMat = SparseMatrix; + using RealScalar = typename NumTraits::Real; + + SpMat R(n, n); + R.reserve(VectorXi::Constant(n, static_cast(n * density) + 1)); + for (Index j = 0; j < n; ++j) { + for (Index i = 0; i < n; ++i) { + if (i == j || (std::rand() / double(RAND_MAX)) < density) { + R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5); + } + } + } + R.makeCompressed(); + SpMat A = R.adjoint() * R; + for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n)); + A.makeCompressed(); + return A; +} + +// ---- GPU CG without preconditioner ------------------------------------------ + +template +void test_gpu_cg(Index n) { + using SpMat = SparseMatrix; + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + SpMat A = make_spd(n); + Vec b = Vec::Random(n); + + // CPU reference (identity preconditioner to match GPU). + ConjugateGradient cpu_cg; + cpu_cg.setMaxIterations(1000); + cpu_cg.setTolerance(RealScalar(1e-8)); + cpu_cg.compute(A); + Vec x_cpu = cpu_cg.solve(b); + VERIFY_IS_EQUAL(cpu_cg.info(), Success); + + // GPU CG: mirrors Eigen's conjugate_gradient() using DeviceMatrix ops. + GpuContext ctx; + GpuContext::setThreadLocal(&ctx); + GpuSparseContext spmv_ctx(ctx); + auto mat = spmv_ctx.deviceView(A); + + auto d_b = DeviceMatrix::fromHost(b, ctx.stream()); + DeviceMatrix d_x(n, 1); + d_x.setZero(ctx); + + // r = b (since x=0) + DeviceMatrix residual(n, 1); + residual.copyFrom(ctx, d_b); + + RealScalar rhsNorm2 = d_b.squaredNorm(ctx); + RealScalar tol = RealScalar(1e-8); + RealScalar threshold = tol * tol * rhsNorm2; + RealScalar residualNorm2 = residual.squaredNorm(ctx); + + // p = r (no preconditioner) + DeviceMatrix p(n, 1); + p.copyFrom(ctx, residual); + DeviceMatrix z(n, 1), tmp(n, 1); + + auto absNew = residual.dot(ctx, p); + Index maxIters = 1000; + Index i = 0; + while (i < maxIters) { + tmp.noalias() = mat * p; + + auto alpha = absNew / p.dot(ctx, tmp); + d_x += alpha * p; + residual -= alpha * tmp; + + residualNorm2 = residual.squaredNorm(ctx); + if (residualNorm2 < threshold) break; + + // z = r (no preconditioner) + z.copyFrom(ctx, residual); + + auto absOld = std::move(absNew); + absNew = residual.dot(ctx, z); + auto beta = absNew / absOld; + + p *= Scalar(beta); + p += z; + i++; + } + + GpuContext::setThreadLocal(nullptr); + + Vec x_gpu = d_x.toHost(ctx.stream()); + + // Verify residual. + Vec r = A * x_gpu - b; + RealScalar relres = r.norm() / b.norm(); + VERIFY(relres < RealScalar(1e-6)); + + // Compare with CPU. + RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits::epsilon(); + VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol); +} + +// ---- GPU CG with Jacobi preconditioner -------------------------------------- + +template +void test_gpu_cg_jacobi(Index n) { + using SpMat = SparseMatrix; + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + SpMat A = make_spd(n); + Vec b = Vec::Random(n); + + // CPU reference. + ConjugateGradient cpu_cg; + cpu_cg.setMaxIterations(1000); + cpu_cg.setTolerance(RealScalar(1e-8)); + cpu_cg.compute(A); + Vec x_cpu = cpu_cg.solve(b); + + // Extract inverse diagonal. 
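+  // For matrices produced by make_spd() the diagonal entry is always present
+  // and positive (A = R.adjoint() * R with n added to the diagonal), so the
+  // fallback to 1 below is never taken in this test; it is kept for generality.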
+ Vec invdiag(n); + for (Index j = 0; j < A.outerSize(); ++j) { + typename SpMat::InnerIterator it(A, j); + while (it && it.index() != j) ++it; + if (it && it.index() == j && it.value() != Scalar(0)) + invdiag(j) = Scalar(1) / it.value(); + else + invdiag(j) = Scalar(1); + } + + // GPU CG with Jacobi preconditioner. + GpuContext ctx; + GpuContext::setThreadLocal(&ctx); + GpuSparseContext spmv_ctx(ctx); + auto mat = spmv_ctx.deviceView(A); + auto d_invdiag = DeviceMatrix::fromHost(invdiag, ctx.stream()); + + auto d_b = DeviceMatrix::fromHost(b, ctx.stream()); + DeviceMatrix d_x(n, 1); + d_x.setZero(ctx); + + DeviceMatrix residual(n, 1); + residual.copyFrom(ctx, d_b); + + RealScalar rhsNorm2 = d_b.squaredNorm(ctx); + RealScalar tol = RealScalar(1e-8); + RealScalar threshold = tol * tol * rhsNorm2; + RealScalar residualNorm2 = residual.squaredNorm(ctx); + + // p = precond.solve(r) = invdiag .* r + DeviceMatrix p = d_invdiag.cwiseProduct(ctx, residual); + DeviceMatrix z(n, 1), tmp(n, 1); + + auto absNew = residual.dot(ctx, p); + Index maxIters = 1000; + Index i = 0; + while (i < maxIters) { + tmp.noalias() = mat * p; + + auto alpha = absNew / p.dot(ctx, tmp); + d_x += alpha * p; + residual -= alpha * tmp; + + residualNorm2 = residual.squaredNorm(ctx); + if (residualNorm2 < threshold) break; + + // z = precond.solve(r) = invdiag .* r + z.cwiseProduct(ctx, d_invdiag, residual); + + auto absOld = std::move(absNew); + absNew = residual.dot(ctx, z); + auto beta = absNew / absOld; + + p *= beta; + p += z; + i++; + } + + GpuContext::setThreadLocal(nullptr); + + Vec x_gpu = d_x.toHost(ctx.stream()); + + Vec r = A * x_gpu - b; + RealScalar relres = r.norm() / b.norm(); + VERIFY(relres < RealScalar(1e-6)); + + RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits::epsilon(); + VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol); +} + +EIGEN_DECLARE_TEST(gpu_cg) { + CALL_SUBTEST(test_gpu_cg(64)); + CALL_SUBTEST(test_gpu_cg(256)); + CALL_SUBTEST(test_gpu_cg(64)); + CALL_SUBTEST(test_gpu_cg_jacobi(64)); + CALL_SUBTEST(test_gpu_cg_jacobi(256)); + CALL_SUBTEST(test_gpu_cg_jacobi(64)); +} diff --git a/test/gpu_cusparse_spmv.cpp b/test/gpu_cusparse_spmv.cpp index 8ccff5f65..4bc517fbf 100644 --- a/test/gpu_cusparse_spmv.cpp +++ b/test/gpu_cusparse_spmv.cpp @@ -180,6 +180,105 @@ void test_empty() { VERIFY_IS_EQUAL(y.size(), 0); } +// ---- DeviceMatrix SpMV (no host roundtrip) ---------------------------------- + +template +void test_spmv_device(Index n) { + using SpMat = SparseMatrix; + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + SpMat A = make_sparse(n, n); + Vec x = Vec::Random(n); + + // Use shared GpuContext for same-stream execution. + GpuContext gpu_ctx; + GpuSparseContext ctx(gpu_ctx); + + auto d_x = DeviceMatrix::fromHost(x, gpu_ctx.stream()); + DeviceMatrix d_y; + + ctx.multiply(A, d_x, d_y); + + Vec y_gpu = d_y.toHost(gpu_ctx.stream()); + Vec y_cpu = A * x; + + RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits::epsilon(); + VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol); +} + +// ---- Expression syntax: d_y = d_A * d_x ------------------------------------ + +template +void test_spmv_expr(Index n) { + using SpMat = SparseMatrix; + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + SpMat A = make_sparse(n, n); + Vec x = Vec::Random(n); + + GpuContext gpu_ctx; + GpuSparseContext ctx(gpu_ctx); + + // Upload sparse matrix and create device view. + auto d_A = ctx.deviceView(A); + + // Upload x. 
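+  // fromHost() enqueues the copy on gpu_ctx.stream(); the SpMV below runs on
+  // the same stream via the shared GpuContext, so stream ordering makes the
+  // upload visible to the product without any explicit synchronization.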
+ auto d_x = DeviceMatrix::fromHost(x, gpu_ctx.stream()); + + // Expression syntax: d_y = d_A * d_x + DeviceMatrix d_y; + d_y = d_A * d_x; + + // Also test with noalias(): + DeviceMatrix d_tmp; + d_tmp.noalias() = d_A * d_x; + + Vec y_gpu = d_y.toHost(gpu_ctx.stream()); + Vec tmp_gpu = d_tmp.toHost(gpu_ctx.stream()); + Vec y_cpu = A * x; + + RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits::epsilon(); + VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol); + VERIFY((tmp_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol); +} + +// ---- deviceView overwrite: second view replaces first ----------------------- + +template +void test_deviceview_overwrite(Index n) { + using SpMat = SparseMatrix; + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + SpMat A1 = make_sparse(n, n); + SpMat A2 = make_sparse(n, n); // different random matrix + + Vec x = Vec::Random(n); + + GpuContext gpu_ctx; + GpuSparseContext ctx(gpu_ctx); + + // First view: A1. + auto d_A1 = ctx.deviceView(A1); + auto d_x = DeviceMatrix::fromHost(x, gpu_ctx.stream()); + DeviceMatrix d_y1; + d_y1 = d_A1 * d_x; + Vec y1_gpu = d_y1.toHost(gpu_ctx.stream()); + Vec y1_cpu = A1 * x; + RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits::epsilon(); + VERIFY((y1_gpu - y1_cpu).norm() / (y1_cpu.norm() + RealScalar(1)) < tol); + + // Second view overwrites first: now uses A2. + auto d_A2 = ctx.deviceView(A2); + DeviceMatrix d_y2; + d_y2 = d_A2 * d_x; + Vec y2_gpu = d_y2.toHost(gpu_ctx.stream()); + Vec y2_cpu = A2 * x; + VERIFY((y2_gpu - y2_cpu).norm() / (y2_cpu.norm() + RealScalar(1)) < tol); +} + // ---- Per-scalar driver ------------------------------------------------------ template @@ -193,6 +292,9 @@ void test_scalar() { CALL_SUBTEST(test_identity(64)); CALL_SUBTEST(test_reuse(64)); CALL_SUBTEST(test_empty()); + CALL_SUBTEST(test_spmv_device(64)); + CALL_SUBTEST(test_spmv_expr(64)); + CALL_SUBTEST(test_deviceview_overwrite(64)); } EIGEN_DECLARE_TEST(gpu_cusparse_spmv) { diff --git a/test/gpu_device_matrix.cpp b/test/gpu_device_matrix.cpp index 1ec0ce239..3e1d006ac 100644 --- a/test/gpu_device_matrix.cpp +++ b/test/gpu_device_matrix.cpp @@ -12,6 +12,7 @@ #define EIGEN_USE_GPU #include "main.h" +#include #include using namespace Eigen; @@ -230,6 +231,217 @@ void test_scalar() { CALL_SUBTEST(test_move_assign(64, 64)); } +// ---- BLAS-1: dot product ---------------------------------------------------- + +template +void test_blas1(Index n) { + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + // All BLAS-1 ops share one GpuContext — same stream, zero event overhead. 
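+  // The tolerance below scales as 10 * n * epsilon because dot products and
+  // norms accumulate on the order of n rounding errors, and the GPU may use a
+  // different summation order than Eigen's CPU reduction.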
+ GpuContext ctx; + + RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits::epsilon(); + + // dot + { + Vec a = Vec::Random(n); + Vec b = Vec::Random(n); + auto d_a = DeviceMatrix::fromHost(a, ctx.stream()); + auto d_b = DeviceMatrix::fromHost(b, ctx.stream()); + Scalar gpu_dot = d_a.dot(ctx, d_b); + Scalar cpu_dot = a.dot(b); + VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol); + } + + // norm / squaredNorm + { + Vec a = Vec::Random(n); + auto d_a = DeviceMatrix::fromHost(a, ctx.stream()); + RealScalar gpu_norm = d_a.norm(ctx); + RealScalar cpu_norm = a.norm(); + VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol); + RealScalar gpu_sqnorm = d_a.squaredNorm(ctx); + RealScalar cpu_sqnorm = a.squaredNorm(); + VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol); + } + + // addScaled (axpy) + { + Vec x = Vec::Random(n); + Vec y = Vec::Random(n); + Scalar alpha(2.5); + Vec y_ref = y + alpha * x; + auto d_y = DeviceMatrix::fromHost(y, ctx.stream()); + auto d_x = DeviceMatrix::fromHost(x, ctx.stream()); + d_y.addScaled(ctx, alpha, d_x); + Vec y_gpu = d_y.toHost(ctx.stream()); + VERIFY((y_gpu - y_ref).norm() < tol * y_ref.norm() + tol); + } + + // scale (scal) + { + Vec x = Vec::Random(n); + Scalar alpha(3.0); + Vec x_ref = alpha * x; + auto d_x = DeviceMatrix::fromHost(x, ctx.stream()); + d_x.scale(ctx, alpha); + Vec x_gpu = d_x.toHost(ctx.stream()); + VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol); + } + + // copyFrom + { + Vec x = Vec::Random(n); + auto d_x = DeviceMatrix::fromHost(x, ctx.stream()); + DeviceMatrix d_y; + d_y.copyFrom(ctx, d_x); + Vec y = d_y.toHost(ctx.stream()); + VERIFY_IS_APPROX(y, x); + } + + // setZero + { + Vec x = Vec::Random(n); + auto d_x = DeviceMatrix::fromHost(x, ctx.stream()); + d_x.setZero(ctx); + Vec result = d_x.toHost(ctx.stream()); + VERIFY_IS_EQUAL(result, Vec::Zero(n)); + } +} + +// ---- BLAS-1 operator overloads (CG-style) ----------------------------------- + +template +void test_cg_operators(Index n) { + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits::epsilon(); + + Vec x = Vec::Random(n); + Vec p = Vec::Random(n); + Vec tmp = Vec::Random(n); + Vec z = Vec::Random(n); + Scalar alpha(2.5); + Scalar beta(0.7); + + // Test: x += alpha * p + { + Vec x_ref = x + alpha * p; + auto d_x = DeviceMatrix::fromHost(x); + auto d_p = DeviceMatrix::fromHost(p); + d_x += alpha * d_p; + Vec x_gpu = d_x.toHost(); + VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol); + } + + // Test: r -= alpha * tmp + { + Vec r = Vec::Random(n); + Vec r_ref = r - alpha * tmp; + auto d_r = DeviceMatrix::fromHost(r); + auto d_tmp = DeviceMatrix::fromHost(tmp); + d_r -= alpha * d_tmp; + Vec r_gpu = d_r.toHost(); + VERIFY((r_gpu - r_ref).norm() < tol * r_ref.norm() + tol); + } + + // Test: p = z + beta * p (cuBLAS geam) + { + Vec p_copy = p; + Vec p_ref = z + beta * p_copy; + auto d_p = DeviceMatrix::fromHost(p_copy); + auto d_z = DeviceMatrix::fromHost(z); + d_p = d_z + beta * d_p; + Vec p_gpu = d_p.toHost(); + VERIFY((p_gpu - p_ref).norm() < tol * p_ref.norm() + tol); + } + + // Test: operator+= and operator-= with DeviceMatrix (no scalar) + { + Vec a = Vec::Random(n); + Vec b = Vec::Random(n); + Vec a_ref = a + b; + auto d_a = DeviceMatrix::fromHost(a); + auto d_b = DeviceMatrix::fromHost(b); + d_a += d_b; + VERIFY((d_a.toHost() - a_ref).norm() < tol * a_ref.norm() + tol); + } +} + +// ---- DeviceScalar: deferred sync 
------------------------------------------- + +template +void test_device_scalar() { + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + const Index n = 256; + Vec a = Vec::Random(n); + Vec b = Vec::Random(n); + + GpuContext ctx; + auto d_a = DeviceMatrix::fromHost(a, ctx.stream()); + auto d_b = DeviceMatrix::fromHost(b, ctx.stream()); + + // dot() returns DeviceScalar — implicit conversion to Scalar syncs. + Scalar gpu_dot = d_a.dot(ctx, d_b); + Scalar cpu_dot = a.dot(b); + RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits::epsilon(); + VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol); + + // squaredNorm() returns host RealScalar directly (syncs internally). + RealScalar gpu_sqnorm = d_a.squaredNorm(ctx); + RealScalar cpu_sqnorm = a.squaredNorm(); + VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol); + + // norm() returns DeviceScalar — implicit conversion syncs. + RealScalar gpu_norm = d_a.norm(ctx); + RealScalar cpu_norm = a.norm(); + VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol); + + // Convenience overloads (thread-local context). + GpuContext::setThreadLocal(&ctx); + Scalar gpu_dot2 = d_a.dot(d_b); + VERIFY(numext::abs(gpu_dot2 - cpu_dot) < tol * numext::abs(cpu_dot) + tol); + GpuContext::setThreadLocal(nullptr); + + // Empty vectors: dot and norm must return zero. + { + DeviceMatrix d_empty(0, 1); + DeviceMatrix d_empty2(0, 1); + Scalar empty_dot = d_empty.dot(ctx, d_empty2); + VERIFY_IS_EQUAL(empty_dot, Scalar(0)); + RealScalar empty_sqnorm = d_empty.squaredNorm(ctx); + VERIFY_IS_EQUAL(empty_sqnorm, RealScalar(0)); + RealScalar empty_norm = d_empty.norm(ctx); + VERIFY_IS_EQUAL(empty_norm, RealScalar(0)); + } +} + +// ---- cwiseProduct ----------------------------------------------------------- + +template +void test_cwiseProduct() { + using Vec = Matrix; + using RealScalar = typename NumTraits::Real; + + const Index n = 256; + Vec a = Vec::Random(n); + Vec b = Vec::Random(n); + Vec ref = a.array() * b.array(); + + GpuContext ctx; + auto d_a = DeviceMatrix::fromHost(a, ctx.stream()); + auto d_b = DeviceMatrix::fromHost(b, ctx.stream()); + auto d_c = d_a.cwiseProduct(ctx, d_b); + Vec result = d_c.toHost(ctx.stream()); + + RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits::epsilon(); + VERIFY((result - ref).norm() < tol * ref.norm() + tol); +} + EIGEN_DECLARE_TEST(gpu_device_matrix) { CALL_SUBTEST(test_default_construct()); CALL_SUBTEST(test_empty()); @@ -242,4 +454,18 @@ EIGEN_DECLARE_TEST(gpu_device_matrix) { CALL_SUBTEST(test_scalar()); CALL_SUBTEST(test_scalar>()); CALL_SUBTEST(test_scalar>()); + CALL_SUBTEST(test_blas1(256)); + CALL_SUBTEST(test_blas1(256)); + CALL_SUBTEST(test_blas1>(256)); + CALL_SUBTEST(test_blas1>(256)); + CALL_SUBTEST(test_cg_operators(256)); + CALL_SUBTEST(test_cg_operators(256)); + CALL_SUBTEST(test_cg_operators>(256)); + CALL_SUBTEST(test_cg_operators>(256)); + CALL_SUBTEST(test_device_scalar()); + CALL_SUBTEST(test_device_scalar()); + CALL_SUBTEST(test_device_scalar>()); + CALL_SUBTEST(test_device_scalar>()); + CALL_SUBTEST(test_cwiseProduct()); + CALL_SUBTEST(test_cwiseProduct()); }