mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls. CPU and GPU solvers coexist in the same binary with compatible syntax. Core infrastructure: - DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with async host transfer (fromHost/toHost) and CUDA event-based cross-stream synchronization. - GpuContext: Unified execution context owning a CUDA stream + cuBLAS handle + cuSOLVER handle. Thread-local default with explicit override via setThreadLocal(). Stream-borrowing constructor for integration. - DeviceBuffer: Typed RAII device allocation with move semantics. cuBLAS dispatch (expression syntax): - GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm) - TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm) - SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm) - SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk) cuSOLVER dispatch: - GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs) - GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs) - Solver chaining: auto x = d_A.llt().solve(d_B) - Solver expressions with .device(ctx) for explicit stream control. CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11, Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit. Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp, gpu_device_matrix.cpp, gpu_library_example.cu Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp, bench_gpu_batching.cpp
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.10.0)
|
||||
cmake_minimum_required(VERSION 3.17)
|
||||
|
||||
#==============================================================================
|
||||
# CMake Policy issues.
|
||||
@@ -9,7 +9,7 @@ if (POLICY CMP0077)
|
||||
endif (POLICY CMP0077)
|
||||
|
||||
# NOTE Remove setting the policy once the minimum required CMake version is
|
||||
# increased to at least 3.15. Retain enabling the export to package registry.
|
||||
# increased to at least 3.21. Retain enabling the export to package registry.
|
||||
if (POLICY CMP0090)
|
||||
# The export command does not populate package registry by default
|
||||
cmake_policy (SET CMP0090 NEW)
|
||||
|
||||
55
Eigen/GPU
Normal file
55
Eigen/GPU
Normal file
@@ -0,0 +1,55 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_GPU_MODULE_H
|
||||
#define EIGEN_GPU_MODULE_H
|
||||
|
||||
#include "Core"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
/** \defgroup GPU_Module GPU module
|
||||
*
|
||||
* GPU-accelerated solvers and operations using NVIDIA CUDA libraries
|
||||
* (cuSOLVER, cuBLAS, cuSPARSE, cuFFT, cuDSS).
|
||||
*
|
||||
* This module provides explicit GPU solver classes that coexist with Eigen's
|
||||
* CPU solvers. Unlike the LAPACKE dispatch (which replaces the CPU
|
||||
* implementation globally), GPU classes are separate types the user
|
||||
* instantiates by choice:
|
||||
*
|
||||
* \code
|
||||
* #define EIGEN_USE_GPU
|
||||
* #include <Eigen/GPU>
|
||||
*
|
||||
* // CPU path (unchanged)
|
||||
* Eigen::LLT<Eigen::MatrixXd> llt_cpu(A);
|
||||
*
|
||||
* // GPU path (explicit)
|
||||
* Eigen::GpuLLT<double> llt_gpu(A); // L stays on device
|
||||
* auto X = llt_gpu.solve(B); // only B transferred per solve
|
||||
* \endcode
|
||||
*
|
||||
* Requires CUDA 11.4+. See CLAUDE.md.
|
||||
*/
|
||||
|
||||
#ifdef EIGEN_USE_GPU
|
||||
// IWYU pragma: begin_exports
|
||||
#include "src/GPU/DeviceMatrix.h"
|
||||
#include "src/GPU/GpuContext.h"
|
||||
#include "src/GPU/DeviceExpr.h"
|
||||
#include "src/GPU/DeviceBlasExpr.h"
|
||||
#include "src/GPU/DeviceSolverExpr.h"
|
||||
#include "src/GPU/DeviceDispatch.h"
|
||||
#include "src/GPU/GpuLLT.h"
|
||||
#include "src/GPU/GpuLU.h"
|
||||
// IWYU pragma: end_exports
|
||||
#endif
|
||||
|
||||
#include "src/Core/util/ReenableStupidWarnings.h"
|
||||
|
||||
#endif // EIGEN_GPU_MODULE_H
|
||||
160
Eigen/src/GPU/CuBlasSupport.h
Normal file
160
Eigen/src/GPU/CuBlasSupport.h
Normal file
@@ -0,0 +1,160 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// cuBLAS-specific support types:
|
||||
// - Error-checking macro
|
||||
// - Operation enum and mapping to cublasOperation_t
|
||||
//
|
||||
// Generic CUDA runtime utilities (DeviceBuffer, cuda_data_type) are in GpuSupport.h.
|
||||
|
||||
#ifndef EIGEN_GPU_CUBLAS_SUPPORT_H
|
||||
#define EIGEN_GPU_CUBLAS_SUPPORT_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
#include "./GpuSupport.h"
|
||||
#include <cublas_v2.h>
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// ---- Error-checking macro ---------------------------------------------------
|
||||
|
||||
#define EIGEN_CUBLAS_CHECK(expr) \
|
||||
do { \
|
||||
cublasStatus_t _s = (expr); \
|
||||
eigen_assert(_s == CUBLAS_STATUS_SUCCESS && "cuBLAS call failed"); \
|
||||
} while (0)
|
||||
|
||||
// ---- Operation enum ---------------------------------------------------------
|
||||
// Maps transpose/adjoint flags to cublasOperation_t.
|
||||
|
||||
enum class GpuOp { NoTrans, Trans, ConjTrans };
|
||||
|
||||
constexpr cublasOperation_t to_cublas_op(GpuOp op) {
|
||||
switch (op) {
|
||||
case GpuOp::Trans:
|
||||
return CUBLAS_OP_T;
|
||||
case GpuOp::ConjTrans:
|
||||
return CUBLAS_OP_C;
|
||||
default:
|
||||
return CUBLAS_OP_N;
|
||||
}
|
||||
}
|
||||
|
||||
// ---- Scalar → cublasComputeType_t -------------------------------------------
|
||||
// cublasGemmEx requires a compute type (separate from the data type).
|
||||
|
||||
template <typename Scalar>
|
||||
struct cuda_compute_type;
|
||||
|
||||
template <>
|
||||
struct cuda_compute_type<float> {
|
||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
|
||||
};
|
||||
template <>
|
||||
struct cuda_compute_type<double> {
|
||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
|
||||
};
|
||||
template <>
|
||||
struct cuda_compute_type<std::complex<float>> {
|
||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
|
||||
};
|
||||
template <>
|
||||
struct cuda_compute_type<std::complex<double>> {
|
||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
|
||||
};
|
||||
|
||||
// ---- Type-specific cuBLAS wrappers ------------------------------------------
|
||||
// cuBLAS uses separate functions per type (Strsm, Dtrsm, etc.).
|
||||
// These overloaded wrappers allow calling cublasXtrsm/cublasXsymm/cublasXsyrk
|
||||
// with any supported scalar type.
|
||||
|
||||
// TRSM wrappers
|
||||
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
|
||||
cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha,
|
||||
const float* A, int lda, float* B, int ldb) {
|
||||
return cublasStrsm(h, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
|
||||
}
|
||||
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
|
||||
cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha,
|
||||
const double* A, int lda, double* B, int ldb) {
|
||||
return cublasDtrsm(h, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
|
||||
}
|
||||
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
|
||||
cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
|
||||
const std::complex<float>* alpha, const std::complex<float>* A, int lda,
|
||||
std::complex<float>* B, int ldb) {
|
||||
return cublasCtrsm(h, side, uplo, trans, diag, m, n, reinterpret_cast<const cuComplex*>(alpha),
|
||||
reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(B), ldb);
|
||||
}
|
||||
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
|
||||
cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
|
||||
const std::complex<double>* alpha, const std::complex<double>* A, int lda,
|
||||
std::complex<double>* B, int ldb) {
|
||||
return cublasZtrsm(h, side, uplo, trans, diag, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
|
||||
reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<cuDoubleComplex*>(B), ldb);
|
||||
}
|
||||
|
||||
// SYMM wrappers (real → symm, complex → hemm)
|
||||
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
|
||||
const float* alpha, const float* A, int lda, const float* B, int ldb,
|
||||
const float* beta, float* C, int ldc) {
|
||||
return cublasSsymm(h, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
|
||||
}
|
||||
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
|
||||
const double* alpha, const double* A, int lda, const double* B, int ldb,
|
||||
const double* beta, double* C, int ldc) {
|
||||
return cublasDsymm(h, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
|
||||
}
|
||||
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
|
||||
const std::complex<float>* alpha, const std::complex<float>* A, int lda,
|
||||
const std::complex<float>* B, int ldb, const std::complex<float>* beta,
|
||||
std::complex<float>* C, int ldc) {
|
||||
return cublasChemm(h, side, uplo, m, n, reinterpret_cast<const cuComplex*>(alpha),
|
||||
reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<const cuComplex*>(B), ldb,
|
||||
reinterpret_cast<const cuComplex*>(beta), reinterpret_cast<cuComplex*>(C), ldc);
|
||||
}
|
||||
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
|
||||
const std::complex<double>* alpha, const std::complex<double>* A, int lda,
|
||||
const std::complex<double>* B, int ldb, const std::complex<double>* beta,
|
||||
std::complex<double>* C, int ldc) {
|
||||
return cublasZhemm(h, side, uplo, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
|
||||
reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<const cuDoubleComplex*>(B), ldb,
|
||||
reinterpret_cast<const cuDoubleComplex*>(beta), reinterpret_cast<cuDoubleComplex*>(C), ldc);
|
||||
}
|
||||
|
||||
// SYRK wrappers (real → syrk, complex → herk)
|
||||
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
|
||||
const float* alpha, const float* A, int lda, const float* beta, float* C, int ldc) {
|
||||
return cublasSsyrk(h, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
|
||||
}
|
||||
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
|
||||
const double* alpha, const double* A, int lda, const double* beta, double* C,
|
||||
int ldc) {
|
||||
return cublasDsyrk(h, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
|
||||
}
|
||||
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
|
||||
const float* alpha, const std::complex<float>* A, int lda, const float* beta,
|
||||
std::complex<float>* C, int ldc) {
|
||||
return cublasCherk(h, uplo, trans, n, k, alpha, reinterpret_cast<const cuComplex*>(A), lda, beta,
|
||||
reinterpret_cast<cuComplex*>(C), ldc);
|
||||
}
|
||||
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
|
||||
const double* alpha, const std::complex<double>* A, int lda, const double* beta,
|
||||
std::complex<double>* C, int ldc) {
|
||||
return cublasZherk(h, uplo, trans, n, k, alpha, reinterpret_cast<const cuDoubleComplex*>(A), lda, beta,
|
||||
reinterpret_cast<cuDoubleComplex*>(C), ldc);
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_CUBLAS_SUPPORT_H
|
||||
97
Eigen/src/GPU/CuSolverSupport.h
Normal file
97
Eigen/src/GPU/CuSolverSupport.h
Normal file
@@ -0,0 +1,97 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// cuSOLVER-specific support types:
|
||||
// - cuSOLVER error-checking macro
|
||||
// - RAII wrapper for cusolverDnParams
|
||||
// - Scalar → cudaDataType_t mapping
|
||||
// - (UpLo, StorageOrder) → cublasFillMode_t mapping
|
||||
//
|
||||
// Generic CUDA runtime utilities (DeviceBuffer, EIGEN_CUDA_RUNTIME_CHECK)
|
||||
// are in GpuSupport.h.
|
||||
|
||||
#ifndef EIGEN_GPU_CUSOLVER_SUPPORT_H
|
||||
#define EIGEN_GPU_CUSOLVER_SUPPORT_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
#include "./GpuSupport.h"
|
||||
#include <cusolverDn.h>
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// ---- Error-checking macros --------------------------------------------------
|
||||
|
||||
#define EIGEN_CUSOLVER_CHECK(expr) \
|
||||
do { \
|
||||
cusolverStatus_t _s = (expr); \
|
||||
eigen_assert(_s == CUSOLVER_STATUS_SUCCESS && "cuSOLVER call failed"); \
|
||||
} while (0)
|
||||
|
||||
// ---- RAII: cusolverDnParams -------------------------------------------------
|
||||
|
||||
struct CusolverParams {
|
||||
cusolverDnParams_t p = nullptr;
|
||||
|
||||
CusolverParams() { EIGEN_CUSOLVER_CHECK(cusolverDnCreateParams(&p)); }
|
||||
|
||||
~CusolverParams() {
|
||||
if (p) (void)cusolverDnDestroyParams(p); // destructor: can't propagate
|
||||
}
|
||||
|
||||
// Move-only.
|
||||
CusolverParams(CusolverParams&& o) noexcept : p(o.p) { o.p = nullptr; }
|
||||
CusolverParams& operator=(CusolverParams&& o) noexcept {
|
||||
if (this != &o) {
|
||||
if (p) (void)cusolverDnDestroyParams(p);
|
||||
p = o.p;
|
||||
o.p = nullptr;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
CusolverParams(const CusolverParams&) = delete;
|
||||
CusolverParams& operator=(const CusolverParams&) = delete;
|
||||
};
|
||||
|
||||
// ---- Scalar → cudaDataType_t ------------------------------------------------
|
||||
// Alias for backward compatibility. The canonical trait is cuda_data_type<> in GpuSupport.h.
|
||||
template <typename Scalar>
|
||||
using cusolver_data_type = cuda_data_type<Scalar>;
|
||||
|
||||
// ---- (UpLo, StorageOrder) → cublasFillMode_t --------------------------------
|
||||
// cuSOLVER always interprets the matrix as column-major. A row-major matrix A
|
||||
// appears as A^T to cuSOLVER, so the upper/lower triangle is swapped.
|
||||
|
||||
template <int UpLo, int StorageOrder>
|
||||
struct cusolver_fill_mode;
|
||||
|
||||
template <>
|
||||
struct cusolver_fill_mode<Lower, ColMajor> {
|
||||
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
|
||||
};
|
||||
template <>
|
||||
struct cusolver_fill_mode<Upper, ColMajor> {
|
||||
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_UPPER;
|
||||
};
|
||||
template <>
|
||||
struct cusolver_fill_mode<Lower, RowMajor> {
|
||||
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_UPPER;
|
||||
};
|
||||
template <>
|
||||
struct cusolver_fill_mode<Upper, RowMajor> {
|
||||
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_CUSOLVER_SUPPORT_H
|
||||
146
Eigen/src/GPU/DeviceBlasExpr.h
Normal file
146
Eigen/src/GPU/DeviceBlasExpr.h
Normal file
@@ -0,0 +1,146 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// BLAS Level 3 expression types for DeviceMatrix (beyond GEMM):
|
||||
// TrsmExpr → cublasXtrsm (triangular solve)
|
||||
// SymmExpr → cublasXsymm (symmetric multiply, real)
|
||||
// → cublasXhemm (Hermitian multiply, complex)
|
||||
// SyrkExpr → cublasXsyrk (symmetric rank-k update, real)
|
||||
// → cublasXherk (Hermitian rank-k update, complex)
|
||||
|
||||
#ifndef EIGEN_GPU_DEVICE_BLAS_EXPR_H
|
||||
#define EIGEN_GPU_DEVICE_BLAS_EXPR_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
template <typename Scalar_>
|
||||
class DeviceMatrix;
|
||||
|
||||
// ---- DeviceTriangularView ---------------------------------------------------
|
||||
// d_A.triangularView<Lower>() → view with .solve(d_B)
|
||||
|
||||
template <typename Scalar_, int UpLo_>
|
||||
class DeviceTriangularView {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
enum { UpLo = UpLo_ };
|
||||
|
||||
explicit DeviceTriangularView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
|
||||
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
|
||||
|
||||
/** Build a TRSM solve expression. */
|
||||
TrsmExpr<Scalar, UpLo_> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
|
||||
|
||||
private:
|
||||
const DeviceMatrix<Scalar>& mat_;
|
||||
};
|
||||
|
||||
// ---- TrsmExpr: triangularView<UpLo>().solve(B) → cublasXtrsm ---------------
|
||||
|
||||
template <typename Scalar_, int UpLo_>
|
||||
class TrsmExpr {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
enum { UpLo = UpLo_ };
|
||||
|
||||
TrsmExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
|
||||
const DeviceMatrix<Scalar>& matrix() const { return A_; }
|
||||
const DeviceMatrix<Scalar>& rhs() const { return B_; }
|
||||
|
||||
private:
|
||||
const DeviceMatrix<Scalar>& A_;
|
||||
const DeviceMatrix<Scalar>& B_;
|
||||
};
|
||||
|
||||
// ---- DeviceSelfAdjointView --------------------------------------------------
|
||||
// d_A.selfadjointView<Lower>() → view that can multiply: view * d_B
|
||||
|
||||
template <typename Scalar_, int UpLo_>
|
||||
class DeviceSelfAdjointView {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
enum { UpLo = UpLo_ };
|
||||
|
||||
explicit DeviceSelfAdjointView(DeviceMatrix<Scalar>& m) : mat_(m) {}
|
||||
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
|
||||
DeviceMatrix<Scalar>& matrix() { return mat_; }
|
||||
|
||||
/** Rank-k update: C.selfadjointView<Lower>().rankUpdate(A, alpha)
|
||||
* computes C = alpha * A * A^H + C (lower triangle only).
|
||||
* Maps to cublasXsyrk (real) or cublasXherk (complex). */
|
||||
void rankUpdate(const DeviceMatrix<Scalar>& A, RealScalar alpha = RealScalar(1));
|
||||
|
||||
private:
|
||||
DeviceMatrix<Scalar>& mat_;
|
||||
};
|
||||
|
||||
// Const variant for multiplication only (no rankUpdate).
|
||||
template <typename Scalar_, int UpLo_>
|
||||
class ConstDeviceSelfAdjointView {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
enum { UpLo = UpLo_ };
|
||||
|
||||
explicit ConstDeviceSelfAdjointView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
|
||||
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
|
||||
|
||||
private:
|
||||
const DeviceMatrix<Scalar>& mat_;
|
||||
};
|
||||
|
||||
// ---- SymmExpr: selfadjointView<UpLo>() * B → cublasXsymm/Xhemm ------------
|
||||
|
||||
template <typename Scalar_, int UpLo_>
|
||||
class SymmExpr {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
enum { UpLo = UpLo_ };
|
||||
|
||||
SymmExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
|
||||
const DeviceMatrix<Scalar>& matrix() const { return A_; }
|
||||
const DeviceMatrix<Scalar>& rhs() const { return B_; }
|
||||
|
||||
private:
|
||||
const DeviceMatrix<Scalar>& A_;
|
||||
const DeviceMatrix<Scalar>& B_;
|
||||
};
|
||||
|
||||
// operator*: DeviceSelfAdjointView * DeviceMatrix → SymmExpr (mutable and const variants)
|
||||
template <typename S, int UpLo>
|
||||
SymmExpr<S, UpLo> operator*(const DeviceSelfAdjointView<S, UpLo>& a, const DeviceMatrix<S>& b) {
|
||||
return {a.matrix(), b};
|
||||
}
|
||||
template <typename S, int UpLo>
|
||||
SymmExpr<S, UpLo> operator*(const ConstDeviceSelfAdjointView<S, UpLo>& a, const DeviceMatrix<S>& b) {
|
||||
return {a.matrix(), b};
|
||||
}
|
||||
|
||||
// ---- SyrkExpr: rankUpdate(A) → cublasXsyrk/Xherk ---------------------------
|
||||
// C.rankUpdate(A) computes C += A * A^H (or A^H * A depending on convention).
|
||||
|
||||
template <typename Scalar_, int UpLo_>
|
||||
class SyrkExpr {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
enum { UpLo = UpLo_ };
|
||||
|
||||
SyrkExpr(const DeviceMatrix<Scalar>& A) : A_(A) {}
|
||||
const DeviceMatrix<Scalar>& matrix() const { return A_; }
|
||||
|
||||
private:
|
||||
const DeviceMatrix<Scalar>& A_;
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_DEVICE_BLAS_EXPR_H
|
||||
506
Eigen/src/GPU/DeviceDispatch.h
Normal file
506
Eigen/src/GPU/DeviceDispatch.h
Normal file
@@ -0,0 +1,506 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Dispatch functions that map DeviceMatrix expressions to NVIDIA library calls.
|
||||
//
|
||||
// dispatch_gemm() — GemmExpr → cublasXgemm
|
||||
//
|
||||
// Each function documents the exact library call and parameters.
|
||||
|
||||
#ifndef EIGEN_GPU_DEVICE_DISPATCH_H
|
||||
#define EIGEN_GPU_DEVICE_DISPATCH_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
#include "./DeviceExpr.h"
|
||||
#include "./DeviceBlasExpr.h"
|
||||
#include "./DeviceSolverExpr.h"
|
||||
#include "./GpuContext.h"
|
||||
#include "./CuSolverSupport.h"
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// ---- GEMM dispatch ----------------------------------------------------------
|
||||
// GemmExpr<Lhs, Rhs> → cublasGemmEx(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
|
||||
//
|
||||
// The generic API cublasGemmEx handles all scalar types (float, double,
|
||||
// complex<float>, complex<double>) via cudaDataType_t.
|
||||
|
||||
template <typename Lhs, typename Rhs>
|
||||
void dispatch_gemm(
|
||||
GpuContext& ctx, DeviceMatrix<typename device_expr_traits<Lhs>::scalar_type>& dst, const GemmExpr<Lhs, Rhs>& expr,
|
||||
typename device_expr_traits<Lhs>::scalar_type beta_val,
|
||||
typename device_expr_traits<Lhs>::scalar_type alpha_scale = typename device_expr_traits<Lhs>::scalar_type(1)) {
|
||||
using Scalar = typename device_expr_traits<Lhs>::scalar_type;
|
||||
using traits_lhs = device_expr_traits<Lhs>;
|
||||
using traits_rhs = device_expr_traits<Rhs>;
|
||||
|
||||
const DeviceMatrix<Scalar>& A = traits_lhs::matrix(expr.lhs());
|
||||
const DeviceMatrix<Scalar>& B = traits_rhs::matrix(expr.rhs());
|
||||
|
||||
constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op);
|
||||
constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op);
|
||||
|
||||
// GEMM dimensions: C(m,n) = op(A)(m,k) * op(B)(k,n)
|
||||
// op(A) has dimensions (A.rows, A.cols) if NoTrans, (A.cols, A.rows) if Trans/ConjTrans.
|
||||
const int64_t m = (traits_lhs::op == GpuOp::NoTrans) ? A.rows() : A.cols();
|
||||
const int64_t k = (traits_lhs::op == GpuOp::NoTrans) ? A.cols() : A.rows();
|
||||
const int64_t n = (traits_rhs::op == GpuOp::NoTrans) ? B.cols() : B.rows();
|
||||
const int64_t rhs_k = (traits_rhs::op == GpuOp::NoTrans) ? B.rows() : B.cols();
|
||||
|
||||
eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch");
|
||||
|
||||
const int64_t lda = A.outerStride();
|
||||
const int64_t ldb = B.outerStride();
|
||||
|
||||
// Serialize all accesses to the destination buffer on this stream.
|
||||
if (!dst.empty()) {
|
||||
dst.waitReady(ctx.stream());
|
||||
}
|
||||
|
||||
// Allocate or resize destination.
|
||||
const bool resized = dst.empty() || dst.rows() != m || dst.cols() != n;
|
||||
if (resized) {
|
||||
dst.resize(m, n);
|
||||
}
|
||||
const int64_t ldc = dst.outerStride();
|
||||
|
||||
Scalar alpha_val = alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs());
|
||||
|
||||
// Wait for operands to be ready on this stream.
|
||||
A.waitReady(ctx.stream());
|
||||
B.waitReady(ctx.stream());
|
||||
|
||||
// If there is no existing valid destination to accumulate into, treat it as
|
||||
// zero rather than reading uninitialized memory.
|
||||
if (resized && beta_val != Scalar(0) && dst.sizeInBytes() > 0) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
|
||||
}
|
||||
|
||||
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
||||
constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
|
||||
|
||||
EIGEN_CUBLAS_CHECK(cublasGemmEx(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n),
|
||||
static_cast<int>(k), &alpha_val, A.data(), dtype, static_cast<int>(lda), B.data(),
|
||||
dtype, static_cast<int>(ldb), &beta_val, dst.data(), dtype, static_cast<int>(ldc),
|
||||
compute, CUBLAS_GEMM_DEFAULT));
|
||||
|
||||
dst.recordReady(ctx.stream());
|
||||
}
|
||||
|
||||
// ---- LLT solve dispatch -----------------------------------------------------
|
||||
// LltSolveExpr → cusolverDnXpotrf (factorize) + cusolverDnXpotrs (solve).
|
||||
// No caching — factor and workspace are temporary. Syncs to check info.
|
||||
|
||||
template <typename Scalar, int UpLo>
|
||||
void dispatch_llt_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LltSolveExpr<Scalar, UpLo>& expr) {
|
||||
const DeviceMatrix<Scalar>& A = expr.matrix();
|
||||
const DeviceMatrix<Scalar>& B = expr.rhs();
|
||||
|
||||
eigen_assert(A.rows() == A.cols() && "LLT requires a square matrix");
|
||||
eigen_assert(B.rows() == A.rows() && "LLT solve: RHS rows must match matrix size");
|
||||
|
||||
const Index n = A.rows();
|
||||
const int64_t nrhs = static_cast<int64_t>(B.cols());
|
||||
|
||||
// Zero-size fast paths: no work, just resize dst.
|
||||
// Wait on dst before resize to avoid freeing memory another stream is using.
|
||||
if (n == 0 || nrhs == 0) {
|
||||
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||
dst.resize(n == 0 ? 0 : n, B.cols());
|
||||
return;
|
||||
}
|
||||
|
||||
A.waitReady(ctx.stream());
|
||||
B.waitReady(ctx.stream());
|
||||
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||
|
||||
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
||||
constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo, ColMajor>::value;
|
||||
const int64_t lda = static_cast<int64_t>(A.outerStride());
|
||||
const int64_t ldb = static_cast<int64_t>(B.outerStride());
|
||||
eigen_assert(ldb == static_cast<int64_t>(B.rows()) && "DeviceMatrix must be densely packed");
|
||||
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
|
||||
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||
|
||||
// D2D copy A → factor buffer (potrf is in-place).
|
||||
DeviceBuffer d_factor(mat_bytes);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_factor.ptr, A.data(), mat_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
|
||||
|
||||
// Query workspace and factorize.
|
||||
CusolverParams params;
|
||||
DeviceBuffer d_factorize_info(sizeof(int));
|
||||
size_t dev_ws = 0, host_ws = 0;
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), dtype,
|
||||
d_factor.ptr, lda, dtype, &dev_ws, &host_ws));
|
||||
|
||||
DeviceBuffer d_workspace(dev_ws);
|
||||
std::vector<char> h_workspace(host_ws);
|
||||
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(
|
||||
ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), dtype, d_factor.ptr, lda, dtype, d_workspace.ptr,
|
||||
dev_ws, host_ws > 0 ? h_workspace.data() : nullptr, host_ws, static_cast<int*>(d_factorize_info.ptr)));
|
||||
|
||||
// Check factorization info before proceeding to solve.
|
||||
int factorize_info = 0;
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(&factorize_info, d_factorize_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
|
||||
eigen_assert(factorize_info == 0 && "cuSOLVER LLT factorization failed (matrix not positive definite)");
|
||||
|
||||
// D2D copy B → dst (potrs is in-place on the RHS).
|
||||
dst.resize(n, B.cols());
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
|
||||
|
||||
// Solve.
|
||||
DeviceBuffer d_solve_info(sizeof(int));
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), nrhs, dtype,
|
||||
d_factor.ptr, lda, dtype, dst.data(), static_cast<int64_t>(dst.outerStride()),
|
||||
static_cast<int*>(d_solve_info.ptr)));
|
||||
|
||||
// Sync to ensure workspace locals can be freed safely.
|
||||
int solve_info = 0;
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(&solve_info, d_solve_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
|
||||
eigen_assert(solve_info == 0 && "cuSOLVER LLT solve failed");
|
||||
|
||||
dst.recordReady(ctx.stream());
|
||||
}
|
||||
|
||||
// ---- LU solve dispatch ------------------------------------------------------
|
||||
// LuSolveExpr → cusolverDnXgetrf (factorize) + cusolverDnXgetrs (solve).
|
||||
|
||||
template <typename Scalar>
|
||||
void dispatch_lu_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LuSolveExpr<Scalar>& expr) {
|
||||
const DeviceMatrix<Scalar>& A = expr.matrix();
|
||||
const DeviceMatrix<Scalar>& B = expr.rhs();
|
||||
|
||||
eigen_assert(A.rows() == A.cols() && "LU requires a square matrix");
|
||||
eigen_assert(B.rows() == A.rows() && "LU solve: RHS rows must match matrix size");
|
||||
|
||||
const Index n = A.rows();
|
||||
const int64_t nrhs = static_cast<int64_t>(B.cols());
|
||||
|
||||
if (n == 0 || nrhs == 0) {
|
||||
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||
dst.resize(n == 0 ? 0 : n, B.cols());
|
||||
return;
|
||||
}
|
||||
|
||||
A.waitReady(ctx.stream());
|
||||
B.waitReady(ctx.stream());
|
||||
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||
|
||||
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
||||
const int64_t lda = static_cast<int64_t>(A.outerStride());
|
||||
const int64_t ldb = static_cast<int64_t>(B.outerStride());
|
||||
eigen_assert(ldb == static_cast<int64_t>(B.rows()) && "DeviceMatrix must be densely packed");
|
||||
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
|
||||
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||
const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t);
|
||||
|
||||
// D2D copy A → LU buffer (getrf is in-place).
|
||||
DeviceBuffer d_lu(mat_bytes);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu.ptr, A.data(), mat_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
|
||||
|
||||
DeviceBuffer d_ipiv(ipiv_bytes);
|
||||
|
||||
// Query workspace and factorize.
|
||||
CusolverParams params;
|
||||
DeviceBuffer d_factorize_info(sizeof(int));
|
||||
size_t dev_ws = 0, host_ws = 0;
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(ctx.cusolverHandle(), params.p, static_cast<int64_t>(n),
|
||||
static_cast<int64_t>(n), dtype, d_lu.ptr, lda, dtype, &dev_ws,
|
||||
&host_ws));
|
||||
|
||||
DeviceBuffer d_workspace(dev_ws);
|
||||
std::vector<char> h_workspace(host_ws);
|
||||
|
||||
EIGEN_CUSOLVER_CHECK(
|
||||
cusolverDnXgetrf(ctx.cusolverHandle(), params.p, static_cast<int64_t>(n), static_cast<int64_t>(n), dtype,
|
||||
d_lu.ptr, lda, static_cast<int64_t*>(d_ipiv.ptr), dtype, d_workspace.ptr, dev_ws,
|
||||
host_ws > 0 ? h_workspace.data() : nullptr, host_ws, static_cast<int*>(d_factorize_info.ptr)));
|
||||
|
||||
// Check factorization info before proceeding to solve.
|
||||
int factorize_info = 0;
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(&factorize_info, d_factorize_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
|
||||
eigen_assert(factorize_info == 0 && "cuSOLVER LU factorization failed (singular matrix)");
|
||||
|
||||
// D2D copy B → dst (getrs is in-place on the RHS).
|
||||
dst.resize(n, B.cols());
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
|
||||
|
||||
// Solve (NoTranspose).
|
||||
DeviceBuffer d_solve_info(sizeof(int));
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, static_cast<int64_t>(n), nrhs,
|
||||
dtype, d_lu.ptr, lda, static_cast<const int64_t*>(d_ipiv.ptr), dtype,
|
||||
dst.data(), static_cast<int64_t>(dst.outerStride()),
|
||||
static_cast<int*>(d_solve_info.ptr)));
|
||||
|
||||
// Sync to ensure workspace locals can be freed safely.
|
||||
int solve_info = 0;
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(&solve_info, d_solve_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
|
||||
eigen_assert(solve_info == 0 && "cuSOLVER LU solve failed");
|
||||
|
||||
dst.recordReady(ctx.stream());
|
||||
}
|
||||
|
||||
// ---- TRSM dispatch ----------------------------------------------------------
|
||||
// TrsmExpr → cublasXtrsm: solve op(A) * X = B where A is triangular.
|
||||
// Side=Left, Diag=NonUnit. A is square, B is n×nrhs.
|
||||
|
||||
template <typename Scalar, int UpLo>
|
||||
void dispatch_trsm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const TrsmExpr<Scalar, UpLo>& expr) {
|
||||
const DeviceMatrix<Scalar>& A = expr.matrix();
|
||||
const DeviceMatrix<Scalar>& B = expr.rhs();
|
||||
|
||||
eigen_assert(A.rows() == A.cols() && "TRSM requires a square triangular matrix");
|
||||
eigen_assert(B.rows() == A.rows() && "TRSM: RHS rows must match matrix size");
|
||||
|
||||
const int n = static_cast<int>(A.rows());
|
||||
const int nrhs = static_cast<int>(B.cols());
|
||||
|
||||
if (n == 0 || nrhs == 0) {
|
||||
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||
dst.resize(n == 0 ? 0 : n, B.cols());
|
||||
return;
|
||||
}
|
||||
|
||||
A.waitReady(ctx.stream());
|
||||
B.waitReady(ctx.stream());
|
||||
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||
|
||||
// D2D copy B → dst (trsm is in-place on the RHS).
|
||||
dst.resize(n, B.cols());
|
||||
const size_t rhs_bytes = static_cast<size_t>(dst.outerStride()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
|
||||
|
||||
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
|
||||
Scalar alpha(1);
|
||||
|
||||
EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs,
|
||||
&alpha, A.data(), static_cast<int>(A.outerStride()), dst.data(),
|
||||
static_cast<int>(dst.outerStride())));
|
||||
|
||||
dst.recordReady(ctx.stream());
|
||||
}
|
||||
|
||||
// ---- SYMM/HEMM dispatch -----------------------------------------------------
|
||||
// SymmExpr → cublasXsymm (real) or cublasXhemm (complex).
|
||||
// C = A * B where A is symmetric/Hermitian. Side=Left.
|
||||
|
||||
template <typename Scalar, int UpLo>
|
||||
void dispatch_symm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SymmExpr<Scalar, UpLo>& expr) {
|
||||
const DeviceMatrix<Scalar>& A = expr.matrix();
|
||||
const DeviceMatrix<Scalar>& B = expr.rhs();
|
||||
|
||||
eigen_assert(A.rows() == A.cols() && "SYMM requires a square matrix");
|
||||
eigen_assert(B.rows() == A.rows() && "SYMM: RHS rows must match matrix size");
|
||||
|
||||
const int m = static_cast<int>(A.rows());
|
||||
const int n = static_cast<int>(B.cols());
|
||||
|
||||
if (m == 0 || n == 0) {
|
||||
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||
dst.resize(m == 0 ? 0 : m, B.cols());
|
||||
return;
|
||||
}
|
||||
|
||||
A.waitReady(ctx.stream());
|
||||
B.waitReady(ctx.stream());
|
||||
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||
|
||||
dst.resize(m, n);
|
||||
|
||||
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
|
||||
Scalar alpha(1), beta(0);
|
||||
|
||||
EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &alpha, A.data(),
|
||||
static_cast<int>(A.outerStride()), B.data(), static_cast<int>(B.outerStride()), &beta,
|
||||
dst.data(), static_cast<int>(dst.outerStride())));
|
||||
|
||||
dst.recordReady(ctx.stream());
|
||||
}
|
||||
|
||||
// ---- SYRK/HERK dispatch -----------------------------------------------------
|
||||
// SyrkExpr → cublasXsyrk (real) or cublasXherk (complex).
|
||||
// C = alpha * A * A^H + beta * C. UpLo specifies which triangle of C is stored.
|
||||
|
||||
template <typename Scalar, int UpLo>
|
||||
void dispatch_syrk(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SyrkExpr<Scalar, UpLo>& expr,
|
||||
typename NumTraits<Scalar>::Real alpha_val, typename NumTraits<Scalar>::Real beta_val) {
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
const DeviceMatrix<Scalar>& A = expr.matrix();
|
||||
|
||||
const int n = static_cast<int>(A.rows());
|
||||
const int k = static_cast<int>(A.cols());
|
||||
|
||||
if (n == 0) {
|
||||
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||
dst.resize(0, 0);
|
||||
return;
|
||||
}
|
||||
|
||||
A.waitReady(ctx.stream());
|
||||
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||
|
||||
if (dst.empty() || dst.rows() != n || dst.cols() != n) {
|
||||
dst.resize(n, n);
|
||||
if (beta_val != RealScalar(0)) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
|
||||
}
|
||||
}
|
||||
|
||||
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
|
||||
|
||||
EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(),
|
||||
static_cast<int>(A.outerStride()), &beta_val, dst.data(),
|
||||
static_cast<int>(dst.outerStride())));
|
||||
|
||||
dst.recordReady(ctx.stream());
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
|
||||
// ---- DeviceAssignment: d_C.device(ctx) = expr ------------------------------
|
||||
// Returned by DeviceMatrix::device(ctx). Dispatches expressions to library calls.
|
||||
|
||||
template <typename Scalar_>
|
||||
class DeviceAssignment {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
|
||||
DeviceAssignment(DeviceMatrix<Scalar>& dst, GpuContext& ctx) : dst_(dst), ctx_(ctx) {}
|
||||
|
||||
// operator= dispatches GEMM with beta=0 (overwrite).
|
||||
template <typename Lhs, typename Rhs>
|
||||
DeviceMatrix<Scalar>& operator=(const GemmExpr<Lhs, Rhs>& expr) {
|
||||
internal::dispatch_gemm(ctx_, dst_, expr, Scalar(0));
|
||||
return dst_;
|
||||
}
|
||||
|
||||
// operator+= dispatches GEMM with beta=1 (accumulate).
|
||||
template <typename Lhs, typename Rhs>
|
||||
DeviceMatrix<Scalar>& operator+=(const GemmExpr<Lhs, Rhs>& expr) {
|
||||
internal::dispatch_gemm(ctx_, dst_, expr, Scalar(1));
|
||||
return dst_;
|
||||
}
|
||||
|
||||
// operator-= dispatches GEMM with negated alpha, beta=1: C = C - alpha*op(A)*op(B).
|
||||
template <typename Lhs, typename Rhs>
|
||||
DeviceMatrix<Scalar>& operator-=(const GemmExpr<Lhs, Rhs>& expr) {
|
||||
internal::dispatch_gemm(ctx_, dst_, expr, Scalar(1), Scalar(-1));
|
||||
return dst_;
|
||||
}
|
||||
|
||||
// operator= dispatches LLT solve (potrf + potrs).
|
||||
template <int UpLo>
|
||||
DeviceMatrix<Scalar>& operator=(const LltSolveExpr<Scalar, UpLo>& expr) {
|
||||
internal::dispatch_llt_solve(ctx_, dst_, expr);
|
||||
return dst_;
|
||||
}
|
||||
|
||||
// operator= dispatches LU solve (getrf + getrs).
|
||||
DeviceMatrix<Scalar>& operator=(const LuSolveExpr<Scalar>& expr) {
|
||||
internal::dispatch_lu_solve(ctx_, dst_, expr);
|
||||
return dst_;
|
||||
}
|
||||
|
||||
// operator= dispatches TRSM (triangular solve).
|
||||
template <int UpLo>
|
||||
DeviceMatrix<Scalar>& operator=(const TrsmExpr<Scalar, UpLo>& expr) {
|
||||
internal::dispatch_trsm(ctx_, dst_, expr);
|
||||
return dst_;
|
||||
}
|
||||
|
||||
// operator= dispatches SYMM/HEMM (symmetric/Hermitian multiply).
|
||||
template <int UpLo>
|
||||
DeviceMatrix<Scalar>& operator=(const SymmExpr<Scalar, UpLo>& expr) {
|
||||
internal::dispatch_symm(ctx_, dst_, expr);
|
||||
return dst_;
|
||||
}
|
||||
|
||||
// Catch-all: static_assert for unsupported expressions.
|
||||
template <typename Expr>
|
||||
DeviceMatrix<Scalar>& operator=(const Expr&) {
|
||||
static_assert(sizeof(Expr) == 0,
|
||||
"DeviceMatrix expression not supported: no cuBLAS/cuSOLVER mapping. "
|
||||
"Supported: GEMM (A*B), TRSM (.triangularView().solve()), "
|
||||
"SYMM (.selfadjointView()*B), LLT (.llt().solve()), LU (.lu().solve()).");
|
||||
return dst_;
|
||||
}
|
||||
|
||||
private:
|
||||
DeviceMatrix<Scalar>& dst_;
|
||||
GpuContext& ctx_;
|
||||
};
|
||||
|
||||
// ---- Out-of-line DeviceMatrix expression operator= definitions -------------
|
||||
// These are declared in DeviceMatrix.h but defined here because they need
|
||||
// GpuContext::threadLocal() which requires the full GpuContext definition.
|
||||
|
||||
template <typename Scalar_>
|
||||
template <typename Lhs, typename Rhs>
|
||||
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const GemmExpr<Lhs, Rhs>& expr) {
|
||||
device(GpuContext::threadLocal()) = expr;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename Scalar_>
|
||||
template <typename Lhs, typename Rhs>
|
||||
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const GemmExpr<Lhs, Rhs>& expr) {
|
||||
device(GpuContext::threadLocal()) += expr;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename Scalar_>
|
||||
template <int UpLo>
|
||||
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const LltSolveExpr<Scalar_, UpLo>& expr) {
|
||||
device(GpuContext::threadLocal()) = expr;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename Scalar_>
|
||||
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const LuSolveExpr<Scalar_>& expr) {
|
||||
device(GpuContext::threadLocal()) = expr;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename Scalar_>
|
||||
template <int UpLo>
|
||||
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const TrsmExpr<Scalar_, UpLo>& expr) {
|
||||
device(GpuContext::threadLocal()) = expr;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename Scalar_>
|
||||
template <int UpLo>
|
||||
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const SymmExpr<Scalar_, UpLo>& expr) {
|
||||
device(GpuContext::threadLocal()) = expr;
|
||||
return *this;
|
||||
}
|
||||
|
||||
// DeviceSelfAdjointView::rankUpdate — defined here because it needs GpuContext.
|
||||
template <typename Scalar_, int UpLo_>
|
||||
void DeviceSelfAdjointView<Scalar_, UpLo_>::rankUpdate(const DeviceMatrix<Scalar_>& A, RealScalar alpha) {
|
||||
SyrkExpr<Scalar_, UpLo_> expr(A);
|
||||
RealScalar beta = matrix().empty() ? RealScalar(0) : RealScalar(1);
|
||||
internal::dispatch_syrk(GpuContext::threadLocal(), matrix(), expr, alpha, beta);
|
||||
}
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_DEVICE_DISPATCH_H
|
||||
224
Eigen/src/GPU/DeviceExpr.h
Normal file
224
Eigen/src/GPU/DeviceExpr.h
Normal file
@@ -0,0 +1,224 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Lightweight expression types for DeviceMatrix operations.
|
||||
//
|
||||
// These are NOT Eigen expression templates. Each type maps 1:1 to a single
|
||||
// NVIDIA library call (cuBLAS or cuSOLVER). There is no coefficient-level
|
||||
// evaluation, no lazy fusion, no packet operations.
|
||||
//
|
||||
// Expression types:
|
||||
// DeviceAdjointView<S> — d_A.adjoint() → marks ConjTrans for GEMM
|
||||
// DeviceTransposeView<S> — d_A.transpose() → marks Trans for GEMM
|
||||
// DeviceScaled<Expr> — alpha * expr → carries scalar factor
|
||||
// GemmExpr<Lhs, Rhs> — lhs * rhs → dispatches to cublasXgemm
|
||||
|
||||
#ifndef EIGEN_GPU_DEVICE_EXPR_H
|
||||
#define EIGEN_GPU_DEVICE_EXPR_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
#include "./CuBlasSupport.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// Forward declaration.
|
||||
template <typename Scalar_>
|
||||
class DeviceMatrix;
|
||||
|
||||
namespace internal {
|
||||
|
||||
// ---- Traits: extract operation info from expression types -------------------
|
||||
|
||||
// Default: a DeviceMatrix is NoTrans.
|
||||
template <typename T>
|
||||
struct device_expr_traits {
|
||||
static constexpr bool is_device_expr = false;
|
||||
};
|
||||
|
||||
template <typename Scalar>
|
||||
struct device_expr_traits<DeviceMatrix<Scalar>> {
|
||||
using scalar_type = Scalar;
|
||||
static constexpr GpuOp op = GpuOp::NoTrans;
|
||||
static constexpr bool is_device_expr = true;
|
||||
static const DeviceMatrix<Scalar>& matrix(const DeviceMatrix<Scalar>& x) { return x; }
|
||||
static Scalar alpha(const DeviceMatrix<Scalar>&) { return Scalar(1); }
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
// ---- DeviceAdjointView: marks ConjTrans ------------------------------------
|
||||
// Returned by DeviceMatrix::adjoint(). Maps to cublasXgemm transA/B = C.
|
||||
|
||||
template <typename Scalar_>
|
||||
class DeviceAdjointView {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
explicit DeviceAdjointView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
|
||||
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
|
||||
|
||||
private:
|
||||
const DeviceMatrix<Scalar>& mat_;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
template <typename Scalar>
|
||||
struct device_expr_traits<DeviceAdjointView<Scalar>> {
|
||||
using scalar_type = Scalar;
|
||||
static constexpr GpuOp op = GpuOp::ConjTrans;
|
||||
static constexpr bool is_device_expr = true;
|
||||
static const DeviceMatrix<Scalar>& matrix(const DeviceAdjointView<Scalar>& x) { return x.matrix(); }
|
||||
static Scalar alpha(const DeviceAdjointView<Scalar>&) { return Scalar(1); }
|
||||
};
|
||||
} // namespace internal
|
||||
|
||||
// ---- DeviceTransposeView: marks Trans --------------------------------------
|
||||
// Returned by DeviceMatrix::transpose(). Maps to cublasXgemm transA/B = T.
|
||||
|
||||
template <typename Scalar_>
|
||||
class DeviceTransposeView {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
explicit DeviceTransposeView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
|
||||
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
|
||||
|
||||
private:
|
||||
const DeviceMatrix<Scalar>& mat_;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
template <typename Scalar>
|
||||
struct device_expr_traits<DeviceTransposeView<Scalar>> {
|
||||
using scalar_type = Scalar;
|
||||
static constexpr GpuOp op = GpuOp::Trans;
|
||||
static constexpr bool is_device_expr = true;
|
||||
static const DeviceMatrix<Scalar>& matrix(const DeviceTransposeView<Scalar>& x) { return x.matrix(); }
|
||||
static Scalar alpha(const DeviceTransposeView<Scalar>&) { return Scalar(1); }
|
||||
};
|
||||
} // namespace internal
|
||||
|
||||
// ---- DeviceScaled: alpha * expr --------------------------------------------
|
||||
// Returned by operator*(Scalar, DeviceMatrix/View). Carries the scalar factor.
|
||||
|
||||
template <typename Inner>
|
||||
class DeviceScaled {
|
||||
public:
|
||||
using Scalar = typename internal::device_expr_traits<Inner>::scalar_type;
|
||||
DeviceScaled(Scalar alpha, const Inner& inner) : alpha_(alpha), inner_(inner) {}
|
||||
Scalar scalar() const { return alpha_; }
|
||||
const Inner& inner() const { return inner_; }
|
||||
|
||||
private:
|
||||
Scalar alpha_;
|
||||
const Inner& inner_;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
template <typename Inner>
|
||||
struct device_expr_traits<DeviceScaled<Inner>> {
|
||||
using scalar_type = typename device_expr_traits<Inner>::scalar_type;
|
||||
static constexpr GpuOp op = device_expr_traits<Inner>::op;
|
||||
static constexpr bool is_device_expr = true;
|
||||
static const DeviceMatrix<scalar_type>& matrix(const DeviceScaled<Inner>& x) {
|
||||
return device_expr_traits<Inner>::matrix(x.inner());
|
||||
}
|
||||
static scalar_type alpha(const DeviceScaled<Inner>& x) {
|
||||
return x.scalar() * device_expr_traits<Inner>::alpha(x.inner());
|
||||
}
|
||||
};
|
||||
} // namespace internal
|
||||
|
||||
// ---- GemmExpr: lhs * rhs → cublasXgemm ------------------------------------
|
||||
// Returned by operator*(lhs_expr, rhs_expr). Dispatches to cuBLAS GEMM.
|
||||
|
||||
template <typename Lhs, typename Rhs>
|
||||
class GemmExpr {
|
||||
public:
|
||||
using Scalar = typename internal::device_expr_traits<Lhs>::scalar_type;
|
||||
static_assert(std::is_same<Scalar, typename internal::device_expr_traits<Rhs>::scalar_type>::value,
|
||||
"DeviceMatrix GEMM: LHS and RHS must have the same scalar type");
|
||||
|
||||
GemmExpr(const Lhs& lhs, const Rhs& rhs) : lhs_(lhs), rhs_(rhs) {}
|
||||
const Lhs& lhs() const { return lhs_; }
|
||||
const Rhs& rhs() const { return rhs_; }
|
||||
|
||||
private:
|
||||
// Stored by reference. Expression objects must not outlive their operands.
|
||||
// This is safe for the one-liner pattern (d_C = d_A * d_B) since all
|
||||
// temporaries live until the semicolon.
|
||||
const Lhs& lhs_;
|
||||
const Rhs& rhs_;
|
||||
};
|
||||
|
||||
// ---- Free operator* overloads that produce GemmExpr ------------------------
|
||||
// These cover: DM*DM, Adj*DM, DM*Adj, Trans*DM, DM*Trans, Scaled*DM, etc.
|
||||
|
||||
// DeviceMatrix * DeviceMatrix
|
||||
template <typename S>
|
||||
GemmExpr<DeviceMatrix<S>, DeviceMatrix<S>> operator*(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
|
||||
return {a, b};
|
||||
}
|
||||
|
||||
// AdjointView * DeviceMatrix
|
||||
template <typename S>
|
||||
GemmExpr<DeviceAdjointView<S>, DeviceMatrix<S>> operator*(const DeviceAdjointView<S>& a, const DeviceMatrix<S>& b) {
|
||||
return {a, b};
|
||||
}
|
||||
|
||||
// DeviceMatrix * AdjointView
|
||||
template <typename S>
|
||||
GemmExpr<DeviceMatrix<S>, DeviceAdjointView<S>> operator*(const DeviceMatrix<S>& a, const DeviceAdjointView<S>& b) {
|
||||
return {a, b};
|
||||
}
|
||||
|
||||
// TransposeView * DeviceMatrix
|
||||
template <typename S>
|
||||
GemmExpr<DeviceTransposeView<S>, DeviceMatrix<S>> operator*(const DeviceTransposeView<S>& a, const DeviceMatrix<S>& b) {
|
||||
return {a, b};
|
||||
}
|
||||
|
||||
// DeviceMatrix * TransposeView
|
||||
template <typename S>
|
||||
GemmExpr<DeviceMatrix<S>, DeviceTransposeView<S>> operator*(const DeviceMatrix<S>& a, const DeviceTransposeView<S>& b) {
|
||||
return {a, b};
|
||||
}
|
||||
|
||||
// Scaled * DeviceMatrix
|
||||
template <typename Inner, typename S>
|
||||
GemmExpr<DeviceScaled<Inner>, DeviceMatrix<S>> operator*(const DeviceScaled<Inner>& a, const DeviceMatrix<S>& b) {
|
||||
return {a, b};
|
||||
}
|
||||
|
||||
// DeviceMatrix * Scaled
|
||||
template <typename S, typename Inner>
|
||||
GemmExpr<DeviceMatrix<S>, DeviceScaled<Inner>> operator*(const DeviceMatrix<S>& a, const DeviceScaled<Inner>& b) {
|
||||
return {a, b};
|
||||
}
|
||||
|
||||
// ---- Scalar * DeviceMatrix / View → DeviceScaled ---------------------------
|
||||
|
||||
template <typename S>
|
||||
DeviceScaled<DeviceMatrix<S>> operator*(S alpha, const DeviceMatrix<S>& m) {
|
||||
return {alpha, m};
|
||||
}
|
||||
|
||||
template <typename S>
|
||||
DeviceScaled<DeviceAdjointView<S>> operator*(S alpha, const DeviceAdjointView<S>& m) {
|
||||
return {alpha, m};
|
||||
}
|
||||
|
||||
template <typename S>
|
||||
DeviceScaled<DeviceTransposeView<S>> operator*(S alpha, const DeviceTransposeView<S>& m) {
|
||||
return {alpha, m};
|
||||
}
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_DEVICE_EXPR_H
|
||||
517
Eigen/src/GPU/DeviceMatrix.h
Normal file
517
Eigen/src/GPU/DeviceMatrix.h
Normal file
@@ -0,0 +1,517 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Typed RAII wrapper for a dense matrix in GPU device memory.
|
||||
//
|
||||
// DeviceMatrix<Scalar> holds a column-major matrix on the GPU with tracked
|
||||
// dimensions and leading dimension. It can be passed to GPU solvers
|
||||
// (GpuLLT, GpuLU, future cuBLAS/cuDSS) without host round-trips.
|
||||
//
|
||||
// Cross-stream safety is automatic: an internal CUDA event tracks when the
|
||||
// last write completed. Consumers on a different stream wait on that event
|
||||
// before reading.
|
||||
//
|
||||
// Usage:
|
||||
// auto d_A = DeviceMatrix<double>::fromHost(A); // upload (sync)
|
||||
// GpuLLT<double> llt;
|
||||
// llt.compute(d_A); // factor on device
|
||||
// auto d_X = llt.solve(d_B); // async, no sync
|
||||
// MatrixXd X = d_X.toHost(); // download + block
|
||||
//
|
||||
// Async variants:
|
||||
// auto d_A = DeviceMatrix<double>::fromHostAsync(A.data(), n, n, n, stream);
|
||||
// auto transfer = d_X.toHostAsync(stream); // enqueue D2H
|
||||
// // ... overlap with other work ...
|
||||
// MatrixXd X = transfer.get(); // block + retrieve
|
||||
|
||||
#ifndef EIGEN_GPU_DEVICE_MATRIX_H
|
||||
#define EIGEN_GPU_DEVICE_MATRIX_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
#include "./GpuSupport.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// Forward declarations.
|
||||
template <typename, int>
|
||||
class GpuLLT;
|
||||
template <typename>
|
||||
class GpuLU;
|
||||
template <typename>
|
||||
class DeviceAdjointView;
|
||||
template <typename>
|
||||
class DeviceTransposeView;
|
||||
template <typename>
|
||||
class DeviceAssignment;
|
||||
template <typename, typename>
|
||||
class GemmExpr;
|
||||
template <typename, int>
|
||||
class LltSolveExpr;
|
||||
template <typename>
|
||||
class LuSolveExpr;
|
||||
template <typename, int>
|
||||
class DeviceLLTView;
|
||||
template <typename>
|
||||
class DeviceLUView;
|
||||
template <typename, int>
|
||||
class DeviceTriangularView;
|
||||
template <typename, int>
|
||||
class DeviceSelfAdjointView;
|
||||
template <typename, int>
|
||||
class ConstDeviceSelfAdjointView;
|
||||
template <typename, int>
|
||||
class TrsmExpr;
|
||||
template <typename, int>
|
||||
class SymmExpr;
|
||||
template <typename, int>
|
||||
class SyrkExpr;
|
||||
class GpuContext;
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// HostTransfer — future-like wrapper for an async device-to-host transfer.
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
/** \ingroup GPU_Module
|
||||
* \class HostTransfer
|
||||
* \brief Future for an asynchronous device-to-host matrix transfer.
|
||||
*
|
||||
* Returned by DeviceMatrix::toHostAsync(). The transfer runs asynchronously
|
||||
* on the given CUDA stream. Call get() to block until complete and retrieve
|
||||
* the host matrix, or ready() to poll without blocking.
|
||||
*/
|
||||
template <typename Scalar_>
|
||||
class HostTransfer {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||
|
||||
/** Block until the transfer completes and return the host matrix.
|
||||
* Idempotent: subsequent calls return the same matrix without re-syncing. */
|
||||
PlainMatrix& get() {
|
||||
if (!synced_) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaEventSynchronize(event_));
|
||||
synced_ = true;
|
||||
}
|
||||
return host_buf_;
|
||||
}
|
||||
|
||||
/** Non-blocking check: has the transfer completed? */
|
||||
bool ready() const {
|
||||
if (synced_) return true;
|
||||
cudaError_t err = cudaEventQuery(event_);
|
||||
if (err == cudaSuccess) return true;
|
||||
eigen_assert(err == cudaErrorNotReady && "cudaEventQuery failed");
|
||||
return false;
|
||||
}
|
||||
|
||||
~HostTransfer() {
|
||||
if (event_) (void)cudaEventDestroy(event_);
|
||||
}
|
||||
|
||||
HostTransfer(HostTransfer&& o) noexcept : host_buf_(std::move(o.host_buf_)), event_(o.event_), synced_(o.synced_) {
|
||||
o.event_ = nullptr;
|
||||
o.synced_ = true;
|
||||
}
|
||||
|
||||
HostTransfer& operator=(HostTransfer&& o) noexcept {
|
||||
if (this != &o) {
|
||||
if (event_) (void)cudaEventDestroy(event_);
|
||||
host_buf_ = std::move(o.host_buf_);
|
||||
event_ = o.event_;
|
||||
synced_ = o.synced_;
|
||||
o.event_ = nullptr;
|
||||
o.synced_ = true;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
HostTransfer(const HostTransfer&) = delete;
|
||||
HostTransfer& operator=(const HostTransfer&) = delete;
|
||||
|
||||
private:
|
||||
template <typename>
|
||||
friend class DeviceMatrix;
|
||||
|
||||
HostTransfer(PlainMatrix&& buf, cudaEvent_t event) : host_buf_(std::move(buf)), event_(event), synced_(false) {}
|
||||
|
||||
PlainMatrix host_buf_;
|
||||
cudaEvent_t event_ = nullptr;
|
||||
bool synced_ = false;
|
||||
};
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// DeviceMatrix — typed RAII wrapper for a dense matrix in device memory.
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
/** \ingroup GPU_Module
|
||||
* \class DeviceMatrix
|
||||
* \brief RAII wrapper for a dense column-major matrix in GPU device memory.
|
||||
*
|
||||
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
|
||||
*
|
||||
* Owns a device allocation with tracked dimensions and leading dimension.
|
||||
* An internal CUDA event records when the data was last written, enabling
|
||||
* safe cross-stream consumption without user-visible synchronization.
|
||||
*
|
||||
* Each method has a synchronous and an asynchronous variant:
|
||||
* - fromHost() / fromHostAsync(): upload from host
|
||||
* - toHost() / toHostAsync(): download to host
|
||||
*/
|
||||
template <typename Scalar_>
|
||||
class DeviceMatrix {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||
|
||||
// ---- Construction / destruction ------------------------------------------
|
||||
|
||||
/** Default: empty (0x0, no allocation). */
|
||||
DeviceMatrix() = default;
|
||||
|
||||
/** Allocate uninitialized device memory for a rows x cols matrix. */
|
||||
DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols), outerStride_(rows) {
|
||||
eigen_assert(rows >= 0 && cols >= 0);
|
||||
size_t bytes = sizeInBytes();
|
||||
if (bytes > 0) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
|
||||
}
|
||||
}
|
||||
|
||||
~DeviceMatrix() {
|
||||
if (data_) (void)cudaFree(data_);
|
||||
if (ready_event_) (void)cudaEventDestroy(ready_event_);
|
||||
}
|
||||
|
||||
// ---- Move-only -----------------------------------------------------------
|
||||
|
||||
DeviceMatrix(DeviceMatrix&& o) noexcept
|
||||
: data_(o.data_),
|
||||
rows_(o.rows_),
|
||||
cols_(o.cols_),
|
||||
outerStride_(o.outerStride_),
|
||||
ready_event_(o.ready_event_),
|
||||
ready_stream_(o.ready_stream_),
|
||||
retained_buffer_(std::move(o.retained_buffer_)) {
|
||||
o.data_ = nullptr;
|
||||
o.rows_ = 0;
|
||||
o.cols_ = 0;
|
||||
o.outerStride_ = 0;
|
||||
o.ready_event_ = nullptr;
|
||||
o.ready_stream_ = nullptr;
|
||||
}
|
||||
|
||||
DeviceMatrix& operator=(DeviceMatrix&& o) noexcept {
|
||||
if (this != &o) {
|
||||
if (data_) (void)cudaFree(data_);
|
||||
if (ready_event_) (void)cudaEventDestroy(ready_event_);
|
||||
data_ = o.data_;
|
||||
rows_ = o.rows_;
|
||||
cols_ = o.cols_;
|
||||
outerStride_ = o.outerStride_;
|
||||
ready_event_ = o.ready_event_;
|
||||
ready_stream_ = o.ready_stream_;
|
||||
retained_buffer_ = std::move(o.retained_buffer_);
|
||||
o.data_ = nullptr;
|
||||
o.rows_ = 0;
|
||||
o.cols_ = 0;
|
||||
o.outerStride_ = 0;
|
||||
o.ready_event_ = nullptr;
|
||||
o.ready_stream_ = nullptr;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
DeviceMatrix(const DeviceMatrix&) = delete;
|
||||
DeviceMatrix& operator=(const DeviceMatrix&) = delete;
|
||||
|
||||
// ---- Upload from host ----------------------------------------------------
|
||||
|
||||
/** Upload a host Eigen matrix to device memory (synchronous).
|
||||
*
|
||||
* Evaluates the expression into a contiguous ColMajor temporary, copies to
|
||||
* device via cudaMemcpyAsync on \p stream, and synchronizes before returning.
|
||||
*
|
||||
* \param host Any Eigen matrix expression.
|
||||
* \param stream CUDA stream for the transfer (default: stream 0).
|
||||
*/
|
||||
template <typename Derived>
|
||||
static DeviceMatrix fromHost(const MatrixBase<Derived>& host, cudaStream_t stream = nullptr) {
|
||||
const PlainMatrix mat(host.derived());
|
||||
DeviceMatrix dm(mat.rows(), mat.cols());
|
||||
if (dm.sizeInBytes() > 0) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dm.data_, mat.data(), dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
|
||||
}
|
||||
return dm;
|
||||
}
|
||||
|
||||
/** Upload from a raw host pointer to device memory (asynchronous).
|
||||
*
|
||||
* Enqueues an async H2D copy on \p stream and records an internal event.
|
||||
* The caller must keep \p host_data alive until the transfer completes
|
||||
* (check via the internal event or synchronize the stream).
|
||||
*
|
||||
* \param host_data Pointer to contiguous column-major host data.
|
||||
* \param rows Number of rows.
|
||||
* \param cols Number of columns.
|
||||
* \param outerStride Leading dimension (>= rows). Use rows for dense.
|
||||
* \param stream CUDA stream for the transfer.
|
||||
*/
|
||||
static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, Index outerStride,
|
||||
cudaStream_t stream) {
|
||||
eigen_assert(rows >= 0 && cols >= 0 && outerStride >= rows);
|
||||
eigen_assert(host_data != nullptr || (rows == 0 || cols == 0));
|
||||
DeviceMatrix dm(rows, cols);
|
||||
if (dm.sizeInBytes() > 0) {
|
||||
// If outerStride == rows (dense), single contiguous copy.
|
||||
// Otherwise, copy column by column (strided layout).
|
||||
if (outerStride == rows) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(dm.data_, host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
|
||||
} else {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(dm.data_, static_cast<size_t>(rows) * sizeof(Scalar), host_data,
|
||||
static_cast<size_t>(outerStride) * sizeof(Scalar),
|
||||
static_cast<size_t>(rows) * sizeof(Scalar),
|
||||
static_cast<size_t>(cols), cudaMemcpyHostToDevice, stream));
|
||||
}
|
||||
dm.recordReady(stream);
|
||||
}
|
||||
return dm;
|
||||
}
|
||||
|
||||
// ---- Download to host ----------------------------------------------------
|
||||
|
||||
/** Download device matrix to host memory (synchronous).
|
||||
*
|
||||
* Waits on the internal ready event, enqueues a D2H copy on \p stream,
|
||||
* synchronizes, and returns the host matrix directly.
|
||||
*
|
||||
* \param stream CUDA stream for the transfer (default: stream 0).
|
||||
*/
|
||||
PlainMatrix toHost(cudaStream_t stream = nullptr) const {
|
||||
PlainMatrix host_buf(rows_, cols_);
|
||||
if (sizeInBytes() > 0) {
|
||||
waitReady(stream);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(host_buf.data(), data_, sizeInBytes(), cudaMemcpyDeviceToHost, stream));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
|
||||
}
|
||||
return host_buf;
|
||||
}
|
||||
|
||||
/** Enqueue an async device-to-host transfer and return a future.
|
||||
*
|
||||
* Waits on the internal ready event (if any) to ensure the device data is
|
||||
* valid, then enqueues the D2H copy on \p stream. Returns a HostTransfer
|
||||
* future; call .get() to block and retrieve the host matrix.
|
||||
*
|
||||
* \param stream CUDA stream for the transfer (default: stream 0).
|
||||
*/
|
||||
HostTransfer<Scalar> toHostAsync(cudaStream_t stream = nullptr) const {
|
||||
PlainMatrix host_buf(rows_, cols_);
|
||||
if (sizeInBytes() > 0) {
|
||||
waitReady(stream);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(host_buf.data(), data_, sizeInBytes(), cudaMemcpyDeviceToHost, stream));
|
||||
}
|
||||
// Record a transfer-complete event.
|
||||
cudaEvent_t transfer_event;
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&transfer_event, cudaEventDisableTiming));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(transfer_event, stream));
|
||||
return HostTransfer<Scalar>(std::move(host_buf), transfer_event);
|
||||
}
|
||||
|
||||
// ---- Device-to-device copy -----------------------------------------------
|
||||
|
||||
/** Deep copy on device. Fully async — records event on the result, no sync.
|
||||
*
|
||||
* \param stream CUDA stream for the D2D copy (default: stream 0).
|
||||
*/
|
||||
DeviceMatrix clone(cudaStream_t stream = nullptr) const {
|
||||
DeviceMatrix result(rows_, cols_);
|
||||
if (sizeInBytes() > 0) {
|
||||
waitReady(stream);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data_, data_, sizeInBytes(), cudaMemcpyDeviceToDevice, stream));
|
||||
result.recordReady(stream);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// ---- Resize (destructive) ------------------------------------------------
|
||||
|
||||
/** Discard contents and reallocate to (rows x cols). Clears the ready event. */
|
||||
void resize(Index rows, Index cols) {
|
||||
if (rows == rows_ && cols == cols_) return;
|
||||
if (data_) {
|
||||
(void)cudaFree(data_);
|
||||
data_ = nullptr;
|
||||
}
|
||||
if (ready_event_) {
|
||||
(void)cudaEventDestroy(ready_event_);
|
||||
ready_event_ = nullptr;
|
||||
}
|
||||
ready_stream_ = nullptr;
|
||||
retained_buffer_ = internal::DeviceBuffer();
|
||||
rows_ = rows;
|
||||
cols_ = cols;
|
||||
outerStride_ = rows;
|
||||
size_t bytes = sizeInBytes();
|
||||
if (bytes > 0) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
|
||||
}
|
||||
}
|
||||
|
||||
// ---- Accessors -----------------------------------------------------------
|
||||
|
||||
Scalar* data() { return data_; }
|
||||
const Scalar* data() const { return data_; }
|
||||
Index rows() const { return rows_; }
|
||||
Index cols() const { return cols_; }
|
||||
Index outerStride() const { return outerStride_; }
|
||||
bool empty() const { return rows_ == 0 || cols_ == 0; }
|
||||
|
||||
/** Size of the device allocation in bytes. */
|
||||
size_t sizeInBytes() const { return static_cast<size_t>(outerStride_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
|
||||
|
||||
// ---- Event synchronization (public for library dispatch interop) ---------
|
||||
|
||||
/** Record that device data is ready after work on \p stream. */
|
||||
void recordReady(cudaStream_t stream) {
|
||||
ensureEvent();
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(ready_event_, stream));
|
||||
ready_stream_ = stream;
|
||||
}
|
||||
|
||||
/** Make \p stream wait until the device data is ready.
|
||||
* No-op if no event recorded, or if the consumer stream is the same as the
|
||||
* producer stream (CUDA guarantees in-order execution within a stream). */
|
||||
void waitReady(cudaStream_t stream) const {
|
||||
if (ready_event_ && stream != ready_stream_) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamWaitEvent(stream, ready_event_, 0));
|
||||
}
|
||||
}
|
||||
|
||||
// ---- Expression methods (dispatch to cuBLAS/cuSOLVER) --------------------
|
||||
|
||||
/** Adjoint view for GEMM dispatch. Maps to cublasXgemm with ConjTrans. */
|
||||
DeviceAdjointView<Scalar> adjoint() const { return DeviceAdjointView<Scalar>(*this); }
|
||||
|
||||
/** Transpose view for GEMM dispatch. Maps to cublasXgemm with Trans. */
|
||||
DeviceTransposeView<Scalar> transpose() const { return DeviceTransposeView<Scalar>(*this); }
|
||||
|
||||
/** Bind this matrix to a GpuContext for expression assignment.
|
||||
* Returns a DeviceAssignment proxy: `d_C.device(ctx) = d_A * d_B;` */
|
||||
DeviceAssignment<Scalar> device(GpuContext& ctx) { return DeviceAssignment<Scalar>(*this, ctx); }
|
||||
|
||||
/** Assign from a GEMM expression using the thread-local default GpuContext.
|
||||
* Defined out-of-line after GpuContext is fully declared (see DeviceDispatch.h). */
|
||||
template <typename Lhs, typename Rhs>
|
||||
DeviceMatrix& operator=(const GemmExpr<Lhs, Rhs>& expr);
|
||||
|
||||
/** Accumulate from a GEMM expression using the thread-local default GpuContext. */
|
||||
template <typename Lhs, typename Rhs>
|
||||
DeviceMatrix& operator+=(const GemmExpr<Lhs, Rhs>& expr);
|
||||
|
||||
/** Cholesky view: d_A.llt().solve(d_B) → LltSolveExpr. */
|
||||
DeviceLLTView<Scalar, Lower> llt() const { return DeviceLLTView<Scalar, Lower>(*this); }
|
||||
|
||||
/** Cholesky view with explicit triangle: d_A.llt<Upper>().solve(d_B). */
|
||||
template <int UpLo>
|
||||
DeviceLLTView<Scalar, UpLo> llt() const {
|
||||
return DeviceLLTView<Scalar, UpLo>(*this);
|
||||
}
|
||||
|
||||
/** LU view: d_A.lu().solve(d_B) → LuSolveExpr. */
|
||||
DeviceLUView<Scalar> lu() const { return DeviceLUView<Scalar>(*this); }
|
||||
|
||||
/** Assign from an LLT solve expression (thread-local default context). */
|
||||
template <int UpLo>
|
||||
DeviceMatrix& operator=(const LltSolveExpr<Scalar, UpLo>& expr);
|
||||
|
||||
/** Assign from an LU solve expression (thread-local default context). */
|
||||
DeviceMatrix& operator=(const LuSolveExpr<Scalar>& expr);
|
||||
|
||||
/** Triangular view: d_A.triangularView<Lower>().solve(d_B) → TrsmExpr. */
|
||||
template <int UpLo>
|
||||
DeviceTriangularView<Scalar, UpLo> triangularView() const {
|
||||
return DeviceTriangularView<Scalar, UpLo>(*this);
|
||||
}
|
||||
|
||||
/** Self-adjoint view (mutable): d_C.selfadjointView<Lower>().rankUpdate(d_A). */
|
||||
template <int UpLo>
|
||||
DeviceSelfAdjointView<Scalar, UpLo> selfadjointView() {
|
||||
return DeviceSelfAdjointView<Scalar, UpLo>(*this);
|
||||
}
|
||||
|
||||
/** Self-adjoint view (const): d_A.selfadjointView<Lower>() * d_B → SymmExpr. */
|
||||
template <int UpLo>
|
||||
ConstDeviceSelfAdjointView<Scalar, UpLo> selfadjointView() const {
|
||||
return ConstDeviceSelfAdjointView<Scalar, UpLo>(*this);
|
||||
}
|
||||
|
||||
/** Assign from a TRSM expression (thread-local default context). */
|
||||
template <int UpLo>
|
||||
DeviceMatrix& operator=(const TrsmExpr<Scalar, UpLo>& expr);
|
||||
|
||||
/** Assign from a SYMM expression (thread-local default context). */
|
||||
template <int UpLo>
|
||||
DeviceMatrix& operator=(const SymmExpr<Scalar, UpLo>& expr);
|
||||
|
||||
private:
|
||||
// ---- Private: adopt a raw device pointer (used by friend solvers) --------
|
||||
|
||||
DeviceMatrix(Scalar* device_ptr, Index rows, Index cols, Index outerStride)
|
||||
: data_(device_ptr), rows_(rows), cols_(cols), outerStride_(outerStride) {}
|
||||
|
||||
/** Transfer ownership of the device pointer out. Zeros internal state. */
|
||||
Scalar* release() {
|
||||
Scalar* p = data_;
|
||||
data_ = nullptr;
|
||||
rows_ = 0;
|
||||
cols_ = 0;
|
||||
outerStride_ = 0;
|
||||
if (ready_event_) {
|
||||
(void)cudaEventDestroy(ready_event_);
|
||||
ready_event_ = nullptr;
|
||||
}
|
||||
ready_stream_ = nullptr;
|
||||
return p;
|
||||
}
|
||||
|
||||
// ---- Private helpers -------------------------------------------------------
|
||||
|
||||
void ensureEvent() {
|
||||
if (!ready_event_) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&ready_event_, cudaEventDisableTiming));
|
||||
}
|
||||
}
|
||||
|
||||
void retainBuffer(internal::DeviceBuffer&& buffer) { retained_buffer_ = std::move(buffer); }
|
||||
|
||||
// ---- Friend declarations ------------------------------------------------
|
||||
|
||||
template <typename, int>
|
||||
friend class GpuLLT;
|
||||
template <typename>
|
||||
friend class GpuLU;
|
||||
|
||||
// ---- Data members --------------------------------------------------------
|
||||
|
||||
Scalar* data_ = nullptr;
|
||||
Index rows_ = 0;
|
||||
Index cols_ = 0;
|
||||
Index outerStride_ = 0;
|
||||
cudaEvent_t ready_event_ = nullptr; // internal: tracks last write completion
|
||||
cudaStream_t ready_stream_ = nullptr; // stream that recorded ready_event_ (for same-stream skip)
|
||||
internal::DeviceBuffer retained_buffer_; // internal: keeps async aux buffers alive
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_DEVICE_MATRIX_H
|
||||
115
Eigen/src/GPU/DeviceSolverExpr.h
Normal file
115
Eigen/src/GPU/DeviceSolverExpr.h
Normal file
@@ -0,0 +1,115 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Solver expression types for DeviceMatrix.
|
||||
//
|
||||
// Each expression maps 1:1 to cuSOLVER library calls:
|
||||
// LltSolveExpr → cusolverDnXpotrf + cusolverDnXpotrs
|
||||
// LuSolveExpr → cusolverDnXgetrf + cusolverDnXgetrs
|
||||
//
|
||||
// Usage:
|
||||
// d_X = d_A.llt().solve(d_B); // Cholesky solve
|
||||
// d_X.device(ctx) = d_A.lu().solve(d_B); // LU solve on explicit stream
|
||||
|
||||
#ifndef EIGEN_GPU_DEVICE_SOLVER_EXPR_H
|
||||
#define EIGEN_GPU_DEVICE_SOLVER_EXPR_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// Forward declarations.
|
||||
template <typename Scalar_>
|
||||
class DeviceMatrix;
|
||||
class GpuContext;
|
||||
|
||||
// ---- LLT solve expression ---------------------------------------------------
|
||||
// d_A.llt().solve(d_B) → LltSolveExpr → cusolverDnXpotrf + cusolverDnXpotrs
|
||||
|
||||
template <typename Scalar_, int UpLo_ = Lower>
|
||||
class LltSolveExpr {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
enum { UpLo = UpLo_ };
|
||||
|
||||
LltSolveExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
|
||||
const DeviceMatrix<Scalar>& matrix() const { return A_; }
|
||||
const DeviceMatrix<Scalar>& rhs() const { return B_; }
|
||||
|
||||
private:
|
||||
const DeviceMatrix<Scalar>& A_;
|
||||
const DeviceMatrix<Scalar>& B_;
|
||||
};
|
||||
|
||||
// ---- LU solve expression ----------------------------------------------------
|
||||
// d_A.lu().solve(d_B) → LuSolveExpr → cusolverDnXgetrf + cusolverDnXgetrs
|
||||
|
||||
template <typename Scalar_>
|
||||
class LuSolveExpr {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
|
||||
LuSolveExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
|
||||
const DeviceMatrix<Scalar>& matrix() const { return A_; }
|
||||
const DeviceMatrix<Scalar>& rhs() const { return B_; }
|
||||
|
||||
private:
|
||||
const DeviceMatrix<Scalar>& A_;
|
||||
const DeviceMatrix<Scalar>& B_;
|
||||
};
|
||||
|
||||
// ---- DeviceLLTView: d_A.llt() → view with .solve() and .device() -----------
|
||||
|
||||
template <typename Scalar_, int UpLo_ = Lower>
|
||||
class DeviceLLTView {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
|
||||
explicit DeviceLLTView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
|
||||
|
||||
/** Build a solve expression: d_A.llt().solve(d_B).
|
||||
* The expression is evaluated when assigned to a DeviceMatrix. */
|
||||
LltSolveExpr<Scalar, UpLo_> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
|
||||
|
||||
// For cached factorizations, use the explicit GpuLLT API directly:
|
||||
// GpuLLT<double> llt;
|
||||
// llt.compute(d_A);
|
||||
// auto d_X1 = llt.solve(d_B1);
|
||||
// auto d_X2 = llt.solve(d_B2);
|
||||
|
||||
private:
|
||||
const DeviceMatrix<Scalar>& mat_;
|
||||
};
|
||||
|
||||
// ---- DeviceLUView: d_A.lu() → view with .solve() and .device() -------------
|
||||
|
||||
template <typename Scalar_>
|
||||
class DeviceLUView {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
|
||||
explicit DeviceLUView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
|
||||
|
||||
/** Build a solve expression: d_A.lu().solve(d_B). */
|
||||
LuSolveExpr<Scalar> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
|
||||
|
||||
// For cached factorizations, use the explicit GpuLU API directly:
|
||||
// GpuLU<double> lu;
|
||||
// lu.compute(d_A);
|
||||
// auto d_X1 = lu.solve(d_B1);
|
||||
// auto d_X2 = lu.solve(d_B2);
|
||||
|
||||
private:
|
||||
const DeviceMatrix<Scalar>& mat_;
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_DEVICE_SOLVER_EXPR_H
|
||||
83
Eigen/src/GPU/GpuContext.h
Normal file
83
Eigen/src/GPU/GpuContext.h
Normal file
@@ -0,0 +1,83 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Unified GPU execution context.
|
||||
//
|
||||
// GpuContext owns a CUDA stream and all NVIDIA library handles (cuBLAS,
|
||||
// cuSOLVER, future cuDSS/cuSPARSE). It is the entry point for all GPU
|
||||
// operations on DeviceMatrix.
|
||||
//
|
||||
// Usage:
|
||||
// GpuContext ctx; // explicit context
|
||||
// d_C.device(ctx) = d_A * d_B; // GEMM on ctx's stream
|
||||
//
|
||||
// d_C = d_A * d_B; // thread-local default context
|
||||
// GpuContext& ctx = GpuContext::threadLocal();
|
||||
|
||||
#ifndef EIGEN_GPU_CONTEXT_H
|
||||
#define EIGEN_GPU_CONTEXT_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
#include "./CuBlasSupport.h"
|
||||
#include "./CuSolverSupport.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \ingroup GPU_Module
|
||||
* \class GpuContext
|
||||
* \brief Unified GPU execution context owning a CUDA stream and library handles.
|
||||
*
|
||||
* Each GpuContext instance creates a dedicated CUDA stream, a cuBLAS handle,
|
||||
* and a cuSOLVER handle, all bound to that stream. Multiple contexts enable
|
||||
* concurrent execution on independent streams.
|
||||
*
|
||||
* A lazily-created thread-local default is available via threadLocal() for
|
||||
* simple single-stream usage.
|
||||
*/
|
||||
class GpuContext {
|
||||
public:
|
||||
GpuContext() {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
|
||||
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
|
||||
}
|
||||
|
||||
~GpuContext() {
|
||||
if (cusolver_) (void)cusolverDnDestroy(cusolver_);
|
||||
if (cublas_) (void)cublasDestroy(cublas_);
|
||||
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||
}
|
||||
|
||||
// Non-copyable, non-movable (owns library handles).
|
||||
GpuContext(const GpuContext&) = delete;
|
||||
GpuContext& operator=(const GpuContext&) = delete;
|
||||
|
||||
/** Lazily-created thread-local default context. */
|
||||
static GpuContext& threadLocal() {
|
||||
thread_local GpuContext ctx;
|
||||
return ctx;
|
||||
}
|
||||
|
||||
cudaStream_t stream() const { return stream_; }
|
||||
cublasHandle_t cublasHandle() const { return cublas_; }
|
||||
cusolverDnHandle_t cusolverHandle() const { return cusolver_; }
|
||||
|
||||
private:
|
||||
cudaStream_t stream_ = nullptr;
|
||||
cublasHandle_t cublas_ = nullptr;
|
||||
cusolverDnHandle_t cusolver_ = nullptr;
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_CONTEXT_H
|
||||
385
Eigen/src/GPU/GpuLLT.h
Normal file
385
Eigen/src/GPU/GpuLLT.h
Normal file
@@ -0,0 +1,385 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Eigen Authors
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// GPU Cholesky (LLT) decomposition using cuSOLVER.
|
||||
//
|
||||
// Unlike Eigen's CPU LLT<MatrixType>, GpuLLT keeps the factored Cholesky
|
||||
// factor in device memory for the lifetime of the object. Multiple solves
|
||||
// against the same factor therefore only transfer the RHS and solution
|
||||
// vectors, not the factor itself.
|
||||
//
|
||||
// Requires CUDA 11.0+ (cusolverDnXpotrf / cusolverDnXpotrs generic API).
|
||||
// Requires CUDA 11.4+ (cusolverDnX generic API + cudaMallocAsync).
|
||||
//
|
||||
// Usage:
|
||||
// GpuLLT<double> llt(A); // upload A, potrf, L stays on device
|
||||
// if (llt.info() != Success) { ... }
|
||||
// MatrixXd x1 = llt.solve(b1); // potrs, only b1 transferred
|
||||
// MatrixXd x2 = llt.solve(b2); // L already on device
|
||||
|
||||
#ifndef EIGEN_GPU_LLT_H
|
||||
#define EIGEN_GPU_LLT_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
#include "./CuSolverSupport.h"
|
||||
#include <vector>
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \ingroup GPU_Module
|
||||
* \class GpuLLT
|
||||
* \brief GPU Cholesky (LL^T) decomposition via cuSOLVER
|
||||
*
|
||||
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
|
||||
* \tparam UpLo_ Triangle used: Lower (default) or Upper
|
||||
*
|
||||
* Factorizes a symmetric positive-definite matrix A = LL^H on the GPU and
|
||||
* caches the factor L in device memory. Each subsequent solve(B) uploads only
|
||||
* B, calls cusolverDnXpotrs, and downloads the result — the factor is not
|
||||
* re-transferred.
|
||||
*
|
||||
* Each GpuLLT object owns a dedicated CUDA stream and cuSOLVER handle,
|
||||
* enabling concurrent factorizations from multiple objects on the same host
|
||||
* thread.
|
||||
*/
|
||||
template <typename Scalar_, int UpLo_ = Lower>
|
||||
class GpuLLT {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||
|
||||
enum { UpLo = UpLo_ };
|
||||
|
||||
// ---- Construction / destruction ------------------------------------------
|
||||
|
||||
/** Default constructor. Does not factorize; call compute() before solve(). */
|
||||
GpuLLT() { init_context(); }
|
||||
|
||||
/** Factor A immediately. Equivalent to GpuLLT llt; llt.compute(A). */
|
||||
template <typename InputType>
|
||||
explicit GpuLLT(const EigenBase<InputType>& A) {
|
||||
init_context();
|
||||
compute(A);
|
||||
}
|
||||
|
||||
~GpuLLT() {
|
||||
// Ignore errors in destructors — cannot propagate.
|
||||
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||
}
|
||||
|
||||
// Non-copyable (owns device memory and library handles).
|
||||
GpuLLT(const GpuLLT&) = delete;
|
||||
GpuLLT& operator=(const GpuLLT&) = delete;
|
||||
|
||||
// Movable.
|
||||
GpuLLT(GpuLLT&& o) noexcept
|
||||
: stream_(o.stream_),
|
||||
handle_(o.handle_),
|
||||
params_(std::move(o.params_)),
|
||||
d_factor_(std::move(o.d_factor_)),
|
||||
factor_alloc_size_(o.factor_alloc_size_),
|
||||
d_scratch_(std::move(o.d_scratch_)),
|
||||
scratch_size_(o.scratch_size_),
|
||||
h_workspace_(std::move(o.h_workspace_)),
|
||||
n_(o.n_),
|
||||
lda_(o.lda_),
|
||||
info_(o.info_),
|
||||
info_word_(o.info_word_),
|
||||
info_synced_(o.info_synced_) {
|
||||
o.stream_ = nullptr;
|
||||
o.handle_ = nullptr;
|
||||
o.factor_alloc_size_ = 0;
|
||||
o.scratch_size_ = 0;
|
||||
o.n_ = 0;
|
||||
o.info_ = InvalidInput;
|
||||
o.info_word_ = 0;
|
||||
o.info_synced_ = true;
|
||||
}
|
||||
|
||||
GpuLLT& operator=(GpuLLT&& o) noexcept {
|
||||
if (this != &o) {
|
||||
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||
stream_ = o.stream_;
|
||||
handle_ = o.handle_;
|
||||
params_ = std::move(o.params_);
|
||||
d_factor_ = std::move(o.d_factor_);
|
||||
factor_alloc_size_ = o.factor_alloc_size_;
|
||||
d_scratch_ = std::move(o.d_scratch_);
|
||||
scratch_size_ = o.scratch_size_;
|
||||
h_workspace_ = std::move(o.h_workspace_);
|
||||
n_ = o.n_;
|
||||
lda_ = o.lda_;
|
||||
info_ = o.info_;
|
||||
info_word_ = o.info_word_;
|
||||
info_synced_ = o.info_synced_;
|
||||
o.stream_ = nullptr;
|
||||
o.handle_ = nullptr;
|
||||
o.factor_alloc_size_ = 0;
|
||||
o.scratch_size_ = 0;
|
||||
o.n_ = 0;
|
||||
o.info_ = InvalidInput;
|
||||
o.info_word_ = 0;
|
||||
o.info_synced_ = true;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// ---- Factorization -------------------------------------------------------
|
||||
|
||||
/** Compute the Cholesky factorization of A (host matrix).
|
||||
*
|
||||
* Uploads A to device memory, calls cusolverDnXpotrf, and retains the
|
||||
* factored matrix on device. Any previous factorization is overwritten.
|
||||
*/
|
||||
template <typename InputType>
|
||||
GpuLLT& compute(const EigenBase<InputType>& A) {
|
||||
eigen_assert(A.rows() == A.cols());
|
||||
if (!begin_compute(A.rows())) return *this;
|
||||
|
||||
// Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions).
|
||||
const PlainMatrix mat(A.derived());
|
||||
lda_ = static_cast<int64_t>(mat.outerStride());
|
||||
allocate_factor_storage();
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_factor_.ptr, mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_));
|
||||
|
||||
factorize();
|
||||
return *this;
|
||||
}
|
||||
|
||||
/** Compute the Cholesky factorization from a device-resident matrix (D2D copy). */
|
||||
GpuLLT& compute(const DeviceMatrix<Scalar>& d_A) {
|
||||
eigen_assert(d_A.rows() == d_A.cols());
|
||||
if (!begin_compute(d_A.rows())) return *this;
|
||||
|
||||
lda_ = static_cast<int64_t>(d_A.outerStride());
|
||||
d_A.waitReady(stream_);
|
||||
allocate_factor_storage();
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_factor_.ptr, d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, stream_));
|
||||
|
||||
factorize();
|
||||
return *this;
|
||||
}
|
||||
|
||||
/** Compute the Cholesky factorization from a device matrix (move, no copy). */
|
||||
GpuLLT& compute(DeviceMatrix<Scalar>&& d_A) {
|
||||
eigen_assert(d_A.rows() == d_A.cols());
|
||||
if (!begin_compute(d_A.rows())) return *this;
|
||||
|
||||
lda_ = static_cast<int64_t>(d_A.outerStride());
|
||||
d_A.waitReady(stream_);
|
||||
d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
|
||||
|
||||
factorize();
|
||||
return *this;
|
||||
}
|
||||
|
||||
// ---- Solve ---------------------------------------------------------------
|
||||
|
||||
/** Solve A * X = B using the cached Cholesky factor (host → host).
|
||||
*
|
||||
* Uploads B to device memory, calls cusolverDnXpotrs using the factor
|
||||
* retained from compute(), and returns the solution X on the host.
|
||||
* The factor is not re-transferred; only B goes up and X comes down.
|
||||
*
|
||||
* \pre compute() must have been called and info() == Success.
|
||||
* \returns X such that A * X ≈ B
|
||||
*/
|
||||
template <typename Rhs>
|
||||
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
|
||||
const_cast<GpuLLT*>(this)->sync_info();
|
||||
eigen_assert(info_ == Success && "GpuLLT::solve called on a failed or uninitialized factorization");
|
||||
eigen_assert(B.rows() == n_);
|
||||
|
||||
const PlainMatrix rhs(B);
|
||||
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
||||
const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
|
||||
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_x_ptr, rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
|
||||
});
|
||||
|
||||
PlainMatrix X(n_, B.cols());
|
||||
int solve_info = 0;
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
|
||||
eigen_assert(solve_info == 0 && "cusolverDnXpotrs reported an error");
|
||||
return X;
|
||||
}
|
||||
|
||||
/** Solve A * X = B with device-resident RHS. Fully async.
|
||||
*
|
||||
* All work is enqueued on this solver's stream. Returns a DeviceMatrix
|
||||
* with a recorded ready event — no host synchronization occurs.
|
||||
* The caller should check info() after compute() to verify the
|
||||
* factorization succeeded; this method does not check.
|
||||
*/
|
||||
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
|
||||
eigen_assert(d_B.rows() == n_);
|
||||
d_B.waitReady(stream_);
|
||||
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
|
||||
const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
|
||||
return solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_x_ptr, d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
|
||||
});
|
||||
}
|
||||
|
||||
// ---- Accessors -----------------------------------------------------------
|
||||
|
||||
/** Returns Success if the last compute() succeeded, NumericalIssue otherwise.
|
||||
* Lazily synchronizes the stream on first call after compute(). */
|
||||
ComputationInfo info() const {
|
||||
const_cast<GpuLLT*>(this)->sync_info();
|
||||
return info_;
|
||||
}
|
||||
|
||||
Index rows() const { return n_; }
|
||||
Index cols() const { return n_; }
|
||||
|
||||
/** Returns the CUDA stream owned by this object.
|
||||
* Advanced users may submit additional GPU work on this stream
|
||||
* to overlap with or chain after GpuLLT operations. */
|
||||
cudaStream_t stream() const { return stream_; }
|
||||
|
||||
private:
|
||||
cudaStream_t stream_ = nullptr;
|
||||
cusolverDnHandle_t handle_ = nullptr;
|
||||
internal::CusolverParams params_; // cuSOLVER params (created once, reused)
|
||||
internal::DeviceBuffer d_factor_; // factored L (or U) on device (grows, never shrinks)
|
||||
size_t factor_alloc_size_ = 0; // current d_factor_ allocation size
|
||||
internal::DeviceBuffer d_scratch_; // combined workspace + info word (grows, never shrinks)
|
||||
size_t scratch_size_ = 0; // current scratch allocation size
|
||||
std::vector<char> h_workspace_; // host workspace (kept alive until next compute)
|
||||
Index n_ = 0;
|
||||
int64_t lda_ = 0;
|
||||
ComputationInfo info_ = InvalidInput;
|
||||
int info_word_ = 0; // host-side target for async info download
|
||||
bool info_synced_ = true; // has the stream been synced for info?
|
||||
|
||||
bool begin_compute(Index rows) {
|
||||
n_ = rows;
|
||||
info_ = InvalidInput;
|
||||
if (n_ == 0) {
|
||||
info_ = Success;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t factorBytes() const { return rhsBytes(static_cast<int64_t>(n_), lda_); }
|
||||
|
||||
static size_t rhsBytes(int64_t cols, int64_t outer_stride) {
|
||||
return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
|
||||
}
|
||||
|
||||
void allocate_factor_storage() {
|
||||
size_t needed = factorBytes();
|
||||
if (needed > factor_alloc_size_) {
|
||||
d_factor_ = internal::DeviceBuffer(needed);
|
||||
factor_alloc_size_ = needed;
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure d_scratch_ is at least `workspace_bytes + sizeof(int)`.
|
||||
// Layout: [workspace (workspace_bytes) | info_word (sizeof(int))].
|
||||
// Ensure d_scratch_ can hold workspace_bytes + an aligned info word.
|
||||
// Grows but never shrinks. Syncs the stream before reallocating to
|
||||
// avoid freeing memory that async kernels may still be using.
|
||||
void ensure_scratch(size_t workspace_bytes) {
|
||||
// Round up so the info word is naturally aligned.
|
||||
// 16-byte alignment for optimal GPU memory access.
|
||||
constexpr size_t kAlign = 16;
|
||||
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
|
||||
size_t needed = workspace_bytes + sizeof(int);
|
||||
if (needed > scratch_size_) {
|
||||
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
d_scratch_ = internal::DeviceBuffer(needed);
|
||||
scratch_size_ = needed;
|
||||
}
|
||||
}
|
||||
|
||||
void* scratch_workspace() const { return d_scratch_.ptr; }
|
||||
int* scratch_info() const {
|
||||
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
|
||||
}
|
||||
|
||||
template <typename CopyRhs>
|
||||
DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, CopyRhs&& copy_rhs) const {
|
||||
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||
constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_, ColMajor>::value;
|
||||
|
||||
Scalar* d_x_ptr = nullptr;
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_x_ptr), rhsBytes(nrhs, ldb)));
|
||||
copy_rhs(d_x_ptr);
|
||||
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, static_cast<int64_t>(n_), nrhs, dtype,
|
||||
d_factor_.ptr, lda_, dtype, d_x_ptr, ldb, scratch_info()));
|
||||
|
||||
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs), static_cast<Index>(ldb));
|
||||
result.recordReady(stream_);
|
||||
return result;
|
||||
}
|
||||
|
||||
void init_context() {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
|
||||
ensure_scratch(0); // allocate at least the info word
|
||||
}
|
||||
|
||||
// Synchronize stream and interpret the info word. No-op if already synced.
|
||||
void sync_info() {
|
||||
if (!info_synced_) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
info_ = (info_word_ == 0) ? Success : NumericalIssue;
|
||||
info_synced_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Run cusolverDnXpotrf on d_factor_ (already on device).
|
||||
// Enqueues factorization + async info download. Does NOT sync.
|
||||
// Workspaces are stored as members to ensure they outlive the async kernels.
|
||||
void factorize() {
|
||||
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||
constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_, ColMajor>::value;
|
||||
|
||||
info_synced_ = false;
|
||||
info_ = InvalidInput;
|
||||
|
||||
size_t dev_ws_bytes = 0, host_ws_bytes = 0;
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(handle_, params_.p, uplo, static_cast<int64_t>(n_), dtype,
|
||||
d_factor_.ptr, lda_, dtype, &dev_ws_bytes, &host_ws_bytes));
|
||||
|
||||
ensure_scratch(dev_ws_bytes);
|
||||
h_workspace_.resize(host_ws_bytes);
|
||||
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(
|
||||
handle_, params_.p, uplo, static_cast<int64_t>(n_), dtype, d_factor_.ptr, lda_, dtype, scratch_workspace(),
|
||||
dev_ws_bytes, host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info()));
|
||||
|
||||
// Enqueue async download of info word — sync deferred to info() or solve().
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_LLT_H
|
||||
371
Eigen/src/GPU/GpuLU.h
Normal file
371
Eigen/src/GPU/GpuLU.h
Normal file
@@ -0,0 +1,371 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Eigen Authors
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// GPU partial-pivoting LU decomposition using cuSOLVER.
|
||||
//
|
||||
// Wraps cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve).
|
||||
// The factored LU matrix and pivot array are kept in device memory for the
|
||||
// lifetime of the object, so repeated solves only transfer the RHS/solution.
|
||||
//
|
||||
// Requires CUDA 11.0+ (cusolverDnX generic API).
|
||||
//
|
||||
// Usage:
|
||||
// GpuLU<double> lu(A); // upload A, getrf, LU+ipiv on device
|
||||
// if (lu.info() != Success) { ... }
|
||||
// MatrixXd x = lu.solve(b); // getrs NoTrans, only b transferred
|
||||
// MatrixXd xt = lu.solve(b, GpuLU<double>::Transpose); // A^T x = b
|
||||
|
||||
#ifndef EIGEN_GPU_LU_H
|
||||
#define EIGEN_GPU_LU_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
#include "./CuSolverSupport.h"
|
||||
#include <vector>
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \ingroup GPU_Module
|
||||
* \class GpuLU
|
||||
* \brief GPU LU decomposition with partial pivoting via cuSOLVER
|
||||
*
|
||||
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
|
||||
*
|
||||
* Decomposes a square matrix A = P L U on the GPU and retains the factored
|
||||
* matrix and pivot array in device memory. Solves A*X=B, A^T*X=B, or
|
||||
* A^H*X=B by passing the appropriate TransposeMode.
|
||||
*
|
||||
* Each GpuLU object owns a dedicated CUDA stream and cuSOLVER handle.
|
||||
*/
|
||||
template <typename Scalar_>
|
||||
class GpuLU {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||
|
||||
/** Controls which system is solved in solve(). */
|
||||
enum TransposeMode {
|
||||
NoTranspose, ///< Solve A * X = B
|
||||
Transpose, ///< Solve A^T * X = B
|
||||
ConjugateTranspose ///< Solve A^H * X = B (same as Transpose for real types)
|
||||
};
|
||||
|
||||
// ---- Construction / destruction ------------------------------------------
|
||||
|
||||
GpuLU() { init_context(); }
|
||||
|
||||
template <typename InputType>
|
||||
explicit GpuLU(const EigenBase<InputType>& A) {
|
||||
init_context();
|
||||
compute(A);
|
||||
}
|
||||
|
||||
~GpuLU() {
|
||||
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||
}
|
||||
|
||||
GpuLU(const GpuLU&) = delete;
|
||||
GpuLU& operator=(const GpuLU&) = delete;
|
||||
|
||||
GpuLU(GpuLU&& o) noexcept
|
||||
: stream_(o.stream_),
|
||||
handle_(o.handle_),
|
||||
params_(std::move(o.params_)),
|
||||
d_lu_(std::move(o.d_lu_)),
|
||||
lu_alloc_size_(o.lu_alloc_size_),
|
||||
d_ipiv_(std::move(o.d_ipiv_)),
|
||||
d_scratch_(std::move(o.d_scratch_)),
|
||||
scratch_size_(o.scratch_size_),
|
||||
h_workspace_(std::move(o.h_workspace_)),
|
||||
n_(o.n_),
|
||||
lda_(o.lda_),
|
||||
info_(o.info_),
|
||||
info_word_(o.info_word_),
|
||||
info_synced_(o.info_synced_) {
|
||||
o.stream_ = nullptr;
|
||||
o.handle_ = nullptr;
|
||||
o.lu_alloc_size_ = 0;
|
||||
o.scratch_size_ = 0;
|
||||
o.n_ = 0;
|
||||
o.info_ = InvalidInput;
|
||||
o.info_word_ = 0;
|
||||
o.info_synced_ = true;
|
||||
}
|
||||
|
||||
GpuLU& operator=(GpuLU&& o) noexcept {
|
||||
if (this != &o) {
|
||||
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||
stream_ = o.stream_;
|
||||
handle_ = o.handle_;
|
||||
params_ = std::move(o.params_);
|
||||
d_lu_ = std::move(o.d_lu_);
|
||||
lu_alloc_size_ = o.lu_alloc_size_;
|
||||
d_ipiv_ = std::move(o.d_ipiv_);
|
||||
d_scratch_ = std::move(o.d_scratch_);
|
||||
scratch_size_ = o.scratch_size_;
|
||||
h_workspace_ = std::move(o.h_workspace_);
|
||||
n_ = o.n_;
|
||||
lda_ = o.lda_;
|
||||
info_ = o.info_;
|
||||
info_word_ = o.info_word_;
|
||||
info_synced_ = o.info_synced_;
|
||||
o.stream_ = nullptr;
|
||||
o.handle_ = nullptr;
|
||||
o.lu_alloc_size_ = 0;
|
||||
o.scratch_size_ = 0;
|
||||
o.n_ = 0;
|
||||
o.info_ = InvalidInput;
|
||||
o.info_word_ = 0;
|
||||
o.info_synced_ = true;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// ---- Factorization -------------------------------------------------------
|
||||
|
||||
/** Compute the LU factorization of A (host matrix, must be square). */
|
||||
template <typename InputType>
|
||||
GpuLU& compute(const EigenBase<InputType>& A) {
|
||||
eigen_assert(A.rows() == A.cols() && "GpuLU requires a square matrix");
|
||||
if (!begin_compute(A.rows())) return *this;
|
||||
|
||||
const PlainMatrix mat(A.derived());
|
||||
lda_ = static_cast<int64_t>(mat.outerStride());
|
||||
allocate_lu_storage();
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_));
|
||||
|
||||
factorize();
|
||||
return *this;
|
||||
}
|
||||
|
||||
/** Compute the LU factorization from a device-resident matrix (D2D copy). */
|
||||
GpuLU& compute(const DeviceMatrix<Scalar>& d_A) {
|
||||
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
|
||||
if (!begin_compute(d_A.rows())) return *this;
|
||||
|
||||
lda_ = static_cast<int64_t>(d_A.outerStride());
|
||||
d_A.waitReady(stream_);
|
||||
allocate_lu_storage();
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_));
|
||||
|
||||
factorize();
|
||||
return *this;
|
||||
}
|
||||
|
||||
/** Compute the LU factorization from a device matrix (move, no copy). */
|
||||
GpuLU& compute(DeviceMatrix<Scalar>&& d_A) {
|
||||
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
|
||||
if (!begin_compute(d_A.rows())) return *this;
|
||||
|
||||
lda_ = static_cast<int64_t>(d_A.outerStride());
|
||||
d_A.waitReady(stream_);
|
||||
d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
|
||||
|
||||
factorize();
|
||||
return *this;
|
||||
}
|
||||
|
||||
// ---- Solve ---------------------------------------------------------------
|
||||
|
||||
/** Solve op(A) * X = B using the cached LU factorization (host → host).
|
||||
*
|
||||
* \param B Right-hand side (n x nrhs host matrix).
|
||||
* \param mode NoTranspose (default), Transpose, or ConjugateTranspose.
|
||||
*/
|
||||
template <typename Rhs>
|
||||
PlainMatrix solve(const MatrixBase<Rhs>& B, TransposeMode mode = NoTranspose) const {
|
||||
const_cast<GpuLU*>(this)->sync_info();
|
||||
eigen_assert(info_ == Success && "GpuLU::solve called on a failed or uninitialized factorization");
|
||||
eigen_assert(B.rows() == n_);
|
||||
|
||||
const PlainMatrix rhs(B);
|
||||
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
||||
const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
|
||||
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_x_ptr, rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
|
||||
});
|
||||
|
||||
PlainMatrix X(n_, B.cols());
|
||||
int solve_info = 0;
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(X.data(), d_X.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
|
||||
eigen_assert(solve_info == 0 && "cusolverDnXgetrs reported an error");
|
||||
return X;
|
||||
}
|
||||
|
||||
/** Solve op(A) * X = B with device-resident RHS. Fully async. */
|
||||
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B, TransposeMode mode = NoTranspose) const {
|
||||
eigen_assert(d_B.rows() == n_);
|
||||
d_B.waitReady(stream_);
|
||||
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
|
||||
const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
|
||||
return solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_x_ptr, d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
|
||||
});
|
||||
}
|
||||
|
||||
// ---- Accessors -----------------------------------------------------------
|
||||
|
||||
/** Lazily synchronizes the stream on first call after compute(). */
|
||||
ComputationInfo info() const {
|
||||
const_cast<GpuLU*>(this)->sync_info();
|
||||
return info_;
|
||||
}
|
||||
Index rows() const { return n_; }
|
||||
Index cols() const { return n_; }
|
||||
cudaStream_t stream() const { return stream_; }
|
||||
|
||||
private:
|
||||
cudaStream_t stream_ = nullptr;
|
||||
cusolverDnHandle_t handle_ = nullptr;
|
||||
internal::CusolverParams params_; // cuSOLVER params (created once, reused)
|
||||
internal::DeviceBuffer d_lu_; // LU factors on device (grows, never shrinks)
|
||||
size_t lu_alloc_size_ = 0; // current d_lu_ allocation size
|
||||
internal::DeviceBuffer d_ipiv_; // pivot indices (int64_t) on device
|
||||
internal::DeviceBuffer d_scratch_; // combined workspace + info word (grows, never shrinks)
|
||||
size_t scratch_size_ = 0; // current scratch allocation size
|
||||
std::vector<char> h_workspace_; // host workspace (kept alive until next compute)
|
||||
Index n_ = 0;
|
||||
int64_t lda_ = 0;
|
||||
ComputationInfo info_ = InvalidInput;
|
||||
int info_word_ = 0; // host-side target for async info download
|
||||
bool info_synced_ = true; // has the stream been synced for info?
|
||||
|
||||
bool begin_compute(Index rows) {
|
||||
n_ = rows;
|
||||
info_ = InvalidInput;
|
||||
if (n_ == 0) {
|
||||
info_ = Success;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t matrixBytes() const { return matrixBytes(static_cast<int64_t>(n_), lda_); }
|
||||
|
||||
static size_t matrixBytes(int64_t cols, int64_t outer_stride) {
|
||||
return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
|
||||
}
|
||||
|
||||
void allocate_lu_storage() {
|
||||
size_t needed = matrixBytes();
|
||||
if (needed > lu_alloc_size_) {
|
||||
d_lu_ = internal::DeviceBuffer(needed);
|
||||
lu_alloc_size_ = needed;
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure d_scratch_ is at least `workspace_bytes + sizeof(int)`.
|
||||
// Layout: [workspace (workspace_bytes) | info_word (sizeof(int))].
|
||||
// Ensure d_scratch_ can hold workspace_bytes + an aligned info word.
|
||||
// Grows but never shrinks. Syncs the stream before reallocating to
|
||||
// avoid freeing memory that async kernels may still be using.
|
||||
void ensure_scratch(size_t workspace_bytes) {
|
||||
constexpr size_t kAlign = 16;
|
||||
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
|
||||
size_t needed = workspace_bytes + sizeof(int);
|
||||
if (needed > scratch_size_) {
|
||||
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
d_scratch_ = internal::DeviceBuffer(needed);
|
||||
scratch_size_ = needed;
|
||||
}
|
||||
}
|
||||
|
||||
void* scratch_workspace() const { return d_scratch_.ptr; }
|
||||
int* scratch_info() const {
|
||||
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
|
||||
}
|
||||
|
||||
template <typename CopyRhs>
|
||||
DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, TransposeMode mode, CopyRhs&& copy_rhs) const {
|
||||
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||
const cublasOperation_t trans = to_cublas_op(mode);
|
||||
|
||||
Scalar* d_x_ptr = nullptr;
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_x_ptr), matrixBytes(nrhs, ldb)));
|
||||
copy_rhs(d_x_ptr);
|
||||
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(handle_, params_.p, trans, static_cast<int64_t>(n_), nrhs, dtype, d_lu_.ptr,
|
||||
lda_, static_cast<const int64_t*>(d_ipiv_.ptr), dtype, d_x_ptr, ldb,
|
||||
scratch_info()));
|
||||
|
||||
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs), static_cast<Index>(ldb));
|
||||
result.recordReady(stream_);
|
||||
return result;
|
||||
}
|
||||
|
||||
void init_context() {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
|
||||
ensure_scratch(0); // allocate at least the info word
|
||||
}
|
||||
|
||||
void sync_info() {
|
||||
if (!info_synced_) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
info_ = (info_word_ == 0) ? Success : NumericalIssue;
|
||||
info_synced_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Run cusolverDnXgetrf on d_lu_ (already on device). Allocates d_ipiv_.
|
||||
// Enqueues factorization + async info download. Does NOT sync.
|
||||
// Workspaces are stored as members to ensure they outlive the async kernels.
|
||||
void factorize() {
|
||||
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||
const size_t ipiv_bytes = static_cast<size_t>(n_) * sizeof(int64_t);
|
||||
|
||||
info_synced_ = false;
|
||||
info_ = InvalidInput;
|
||||
|
||||
d_ipiv_ = internal::DeviceBuffer(ipiv_bytes);
|
||||
|
||||
size_t dev_ws_bytes = 0, host_ws_bytes = 0;
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(handle_, params_.p, static_cast<int64_t>(n_),
|
||||
static_cast<int64_t>(n_), dtype, d_lu_.ptr, lda_, dtype,
|
||||
&dev_ws_bytes, &host_ws_bytes));
|
||||
|
||||
ensure_scratch(dev_ws_bytes);
|
||||
h_workspace_.resize(host_ws_bytes);
|
||||
|
||||
EIGEN_CUSOLVER_CHECK(
|
||||
cusolverDnXgetrf(handle_, params_.p, static_cast<int64_t>(n_), static_cast<int64_t>(n_), dtype, d_lu_.ptr, lda_,
|
||||
static_cast<int64_t*>(d_ipiv_.ptr), dtype, scratch_workspace(), dev_ws_bytes,
|
||||
host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info()));
|
||||
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||
}
|
||||
|
||||
static cublasOperation_t to_cublas_op(TransposeMode mode) {
|
||||
switch (mode) {
|
||||
case Transpose:
|
||||
return CUBLAS_OP_T;
|
||||
case ConjugateTranspose:
|
||||
return CUBLAS_OP_C;
|
||||
default:
|
||||
return CUBLAS_OP_N;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_LU_H
|
||||
101
Eigen/src/GPU/GpuSupport.h
Normal file
101
Eigen/src/GPU/GpuSupport.h
Normal file
@@ -0,0 +1,101 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Generic CUDA runtime support shared across all GPU library integrations
|
||||
// (cuSOLVER, cuBLAS, cuDSS, etc.):
|
||||
// - Error-checking macros
|
||||
// - RAII device buffer
|
||||
//
|
||||
// Only depends on <cuda_runtime.h>. No NVIDIA library headers.
|
||||
|
||||
#ifndef EIGEN_GPU_SUPPORT_H
|
||||
#define EIGEN_GPU_SUPPORT_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// ---- Error-checking macros --------------------------------------------------
|
||||
// These abort (via eigen_assert) on failure. Not for use in destructors.
|
||||
|
||||
#define EIGEN_CUDA_RUNTIME_CHECK(expr) \
|
||||
do { \
|
||||
cudaError_t _e = (expr); \
|
||||
eigen_assert(_e == cudaSuccess && "CUDA runtime call failed"); \
|
||||
} while (0)
|
||||
|
||||
// ---- RAII: device buffer ----------------------------------------------------
|
||||
|
||||
struct DeviceBuffer {
|
||||
void* ptr = nullptr;
|
||||
|
||||
DeviceBuffer() = default;
|
||||
|
||||
explicit DeviceBuffer(size_t bytes) {
|
||||
if (bytes > 0) EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&ptr, bytes));
|
||||
}
|
||||
|
||||
~DeviceBuffer() {
|
||||
if (ptr) (void)cudaFree(ptr); // destructor: ignore errors
|
||||
}
|
||||
|
||||
// Move-only.
|
||||
DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr) { o.ptr = nullptr; }
|
||||
DeviceBuffer& operator=(DeviceBuffer&& o) noexcept {
|
||||
if (this != &o) {
|
||||
if (ptr) (void)cudaFree(ptr);
|
||||
ptr = o.ptr;
|
||||
o.ptr = nullptr;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
DeviceBuffer(const DeviceBuffer&) = delete;
|
||||
DeviceBuffer& operator=(const DeviceBuffer&) = delete;
|
||||
|
||||
// Adopt an existing device pointer. Caller relinquishes ownership.
|
||||
static DeviceBuffer adopt(void* p) {
|
||||
DeviceBuffer b;
|
||||
b.ptr = p;
|
||||
return b;
|
||||
}
|
||||
};
|
||||
|
||||
// ---- Scalar → cudaDataType_t ------------------------------------------------
|
||||
// Shared by cuBLAS and cuSOLVER. cudaDataType_t is defined in library_types.h
|
||||
// which is included transitively by cuda_runtime.h.
|
||||
|
||||
template <typename Scalar>
|
||||
struct cuda_data_type;
|
||||
|
||||
template <>
|
||||
struct cuda_data_type<float> {
|
||||
static constexpr cudaDataType_t value = CUDA_R_32F;
|
||||
};
|
||||
template <>
|
||||
struct cuda_data_type<double> {
|
||||
static constexpr cudaDataType_t value = CUDA_R_64F;
|
||||
};
|
||||
template <>
|
||||
struct cuda_data_type<std::complex<float>> {
|
||||
static constexpr cudaDataType_t value = CUDA_C_32F;
|
||||
};
|
||||
template <>
|
||||
struct cuda_data_type<std::complex<double>> {
|
||||
static constexpr cudaDataType_t value = CUDA_C_64F;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_SUPPORT_H
|
||||
3
Eigen/src/GPU/InternalHeaderCheck.h
Normal file
3
Eigen/src/GPU/InternalHeaderCheck.h
Normal file
@@ -0,0 +1,3 @@
|
||||
#ifndef EIGEN_GPU_MODULE_H
|
||||
#error "Please include Eigen/GPU instead of including headers inside the src/GPU directory directly."
|
||||
#endif
|
||||
318
Eigen/src/GPU/README.md
Normal file
318
Eigen/src/GPU/README.md
Normal file
@@ -0,0 +1,318 @@
|
||||
# Eigen GPU Module (`Eigen/GPU`)
|
||||
|
||||
GPU-accelerated dense linear algebra for Eigen users, dispatching to NVIDIA
|
||||
CUDA libraries (cuBLAS, cuSOLVER). Requires CUDA 11.4+. Header-only (link
|
||||
against CUDA runtime, cuBLAS, and cuSOLVER).
|
||||
|
||||
## Why this module
|
||||
|
||||
Eigen is the linear algebra foundation for a large ecosystem of C++ projects
|
||||
in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP,
|
||||
Open3D), scientific computing (Ceres, Stan), and beyond. Many of these
|
||||
projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations
|
||||
without dropping down to raw CUDA library APIs. Third-party projects like
|
||||
[EigenCuda](https://github.com/NLESC-JCER/EigenCuda) and
|
||||
[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically to fill
|
||||
this gap, and downstream projects like
|
||||
[Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
|
||||
[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
|
||||
GPU-accelerated solvers through Eigen.
|
||||
|
||||
The `Eigen/GPU` module aims to close this gap: Existing Eigen users should be
|
||||
able to move performance-critical dense linear algebra to the GPU with minimal
|
||||
code changes and without learning CUDA library APIs directly.
|
||||
|
||||
## Design philosophy
|
||||
|
||||
**CPU and GPU coexist.** There is no global compile-time switch that replaces
|
||||
CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers
|
||||
explicitly -- `GpuLLT<double>` vs `LLT<MatrixXd>` -- and both coexist in
|
||||
the same binary. This also lets users keep the factored matrix on device across
|
||||
multiple solves, something impossible with compile-time replacement.
|
||||
|
||||
**Familiar syntax.** GPU operations use the same expression patterns as CPU
|
||||
Eigen. Here is a side-by-side comparison:
|
||||
|
||||
```cpp
|
||||
// ---- CPU (Eigen) ---- // ---- GPU (Eigen/GPU) ----
|
||||
#include <Eigen/Dense> #define EIGEN_USE_GPU
|
||||
#include <Eigen/GPU>
|
||||
|
||||
MatrixXd A = ...; auto d_A = DeviceMatrix<double>::fromHost(A);
|
||||
MatrixXd B = ...; auto d_B = DeviceMatrix<double>::fromHost(B);
|
||||
|
||||
MatrixXd C = A * B; DeviceMatrix<double> d_C = d_A * d_B;
|
||||
MatrixXd X = A.llt().solve(B); DeviceMatrix<double> d_X = d_A.llt().solve(d_B);
|
||||
|
||||
MatrixXd X = d_X.toHost();
|
||||
```
|
||||
|
||||
The GPU version reads like CPU Eigen with explicit upload/download.
|
||||
`operator*` dispatches to cuBLAS GEMM, `.llt().solve()` dispatches to
|
||||
cuSOLVER potrf + potrs. Unsupported expressions are compile errors.
|
||||
|
||||
**Explicit over implicit.** Host-device transfers, stream management, and
|
||||
library handle lifetimes are visible in the API. There are no hidden
|
||||
allocations or synchronizations except where documented (e.g., `toHost()` must
|
||||
synchronize to deliver data to the host).
|
||||
|
||||
## Key concepts
|
||||
|
||||
### `DeviceMatrix<Scalar>`
|
||||
|
||||
A typed RAII wrapper for a dense column-major matrix in GPU device memory.
|
||||
This is the GPU counterpart of Eigen's `MatrixX<Scalar>`. A vector is simply
|
||||
a `DeviceMatrix` with one column.
|
||||
|
||||
```cpp
|
||||
// Upload from host
|
||||
auto d_A = DeviceMatrix<double>::fromHost(A);
|
||||
|
||||
// Allocate uninitialized
|
||||
DeviceMatrix<double> d_C(m, n);
|
||||
|
||||
// Download to host
|
||||
MatrixXd C = d_C.toHost();
|
||||
|
||||
// Async download (returns a future)
|
||||
auto transfer = d_C.toHostAsync();
|
||||
// ... do other work ...
|
||||
MatrixXd C = transfer.get();
|
||||
```
|
||||
|
||||
`DeviceMatrix` supports expression methods that mirror Eigen's API:
|
||||
`adjoint()`, `transpose()`, `triangularView<UpLo>()`,
|
||||
`selfadjointView<UpLo>()`, `llt()`, `lu()`. These return lightweight
|
||||
expression objects that are evaluated when assigned.
|
||||
|
||||
### `GpuContext`
|
||||
|
||||
Every GPU operation needs a CUDA stream and library handles (cuBLAS,
|
||||
cuSOLVER). `GpuContext` bundles these together.
|
||||
|
||||
For simple usage, you don't need to create one -- a per-thread default context
|
||||
is created lazily on first use:
|
||||
|
||||
```cpp
|
||||
// These use the thread-local default context automatically
|
||||
d_C = d_A * d_B;
|
||||
d_X = d_A.llt().solve(d_B);
|
||||
```
|
||||
|
||||
For concurrent multi-stream execution, create explicit contexts:
|
||||
|
||||
```cpp
|
||||
GpuContext ctx1, ctx2;
|
||||
d_C1.device(ctx1) = d_A1 * d_B1; // runs on stream 1
|
||||
d_C2.device(ctx2) = d_A2 * d_B2; // runs on stream 2 (concurrently)
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Matrix operations (cuBLAS)
|
||||
|
||||
```cpp
|
||||
auto d_A = DeviceMatrix<double>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<double>::fromHost(B);
|
||||
|
||||
// GEMM: C = A * B, C = A^H * B, C = A * B^T, ...
|
||||
DeviceMatrix<double> d_C = d_A * d_B;
|
||||
d_C = d_A.adjoint() * d_B;
|
||||
d_C = d_A * d_B.transpose();
|
||||
|
||||
// Scaled and accumulated
|
||||
d_C += 2.0 * d_A * d_B; // alpha=2, beta=1
|
||||
d_C.device(ctx) -= d_A * d_B; // alpha=-1, beta=1 (requires explicit context)
|
||||
|
||||
// Triangular solve (TRSM)
|
||||
d_X = d_A.triangularView<Lower>().solve(d_B);
|
||||
|
||||
// Symmetric/Hermitian multiply (SYMM/HEMM)
|
||||
d_C = d_A.selfadjointView<Lower>() * d_B;
|
||||
|
||||
// Rank-k update (SYRK/HERK)
|
||||
d_C.selfadjointView<Lower>().rankUpdate(d_A); // C += A * A^H
|
||||
```
|
||||
|
||||
### Dense solvers (cuSOLVER)
|
||||
|
||||
**One-shot expression syntax** -- Convenient, re-factorizes each time:
|
||||
|
||||
```cpp
|
||||
// Cholesky solve (potrf + potrs)
|
||||
d_X = d_A.llt().solve(d_B);
|
||||
|
||||
// LU solve (getrf + getrs)
|
||||
d_Y = d_A.lu().solve(d_B);
|
||||
```
|
||||
|
||||
**Cached factorization** -- Factor once, solve many times:
|
||||
|
||||
```cpp
|
||||
GpuLLT<double> llt;
|
||||
llt.compute(d_A); // factorize (async)
|
||||
if (llt.info() != Success) { ... } // lazy sync on first info() call
|
||||
auto d_X1 = llt.solve(d_B1); // reuses factor (async)
|
||||
auto d_X2 = llt.solve(d_B2); // reuses factor (async)
|
||||
MatrixXd X2 = d_X2.toHost();
|
||||
|
||||
// LU with transpose solve
|
||||
GpuLU<double> lu;
|
||||
lu.compute(d_A);
|
||||
auto d_Y = lu.solve(d_B, GpuLU<double>::Transpose); // A^T Y = B
|
||||
```
|
||||
|
||||
The cached API keeps the factored matrix on device, avoiding redundant
|
||||
host-device transfers and re-factorizations.
|
||||
|
||||
### Stream control and async execution
|
||||
|
||||
Operations are asynchronous by default. The compute-solve chain runs without
|
||||
host synchronization until you need a result on the host:
|
||||
|
||||
```
|
||||
fromHost(A) --sync--> compute() --async--> solve() --async--> toHost()
|
||||
H2D potrf potrs D2H
|
||||
sync
|
||||
```
|
||||
|
||||
Mandatory sync points:
|
||||
- `fromHost()` -- Synchronizes to complete the upload before returning
|
||||
- `toHost()` / `HostTransfer::get()` -- Must deliver data to host
|
||||
- `info()` -- Must read the factorization status
|
||||
|
||||
**Cross-stream safety** is automatic. `DeviceMatrix` tracks write completion
|
||||
via CUDA events. When a matrix written on stream A is read on stream B, the
|
||||
module automatically inserts `cudaStreamWaitEvent`. Same-stream operations
|
||||
skip the wait (CUDA guarantees in-order execution within a stream).
|
||||
|
||||
## Reference
|
||||
|
||||
### Supported scalar types
|
||||
|
||||
`float`, `double`, `std::complex<float>`, `std::complex<double>`.
|
||||
|
||||
### Expression -> library call mapping
|
||||
|
||||
| DeviceMatrix expression | Library call | Parameters |
|
||||
|---|---|---|
|
||||
| `C = A * B` | `cublasGemmEx` | transA=N, transB=N, alpha=1, beta=0 |
|
||||
| `C = A.adjoint() * B` | `cublasGemmEx` | transA=C, transB=N |
|
||||
| `C = A.transpose() * B` | `cublasGemmEx` | transA=T, transB=N |
|
||||
| `C = A * B.adjoint()` | `cublasGemmEx` | transA=N, transB=C |
|
||||
| `C = A * B.transpose()` | `cublasGemmEx` | transA=N, transB=T |
|
||||
| `C = alpha * A * B` | `cublasGemmEx` | alpha from LHS |
|
||||
| `C = A * (alpha * B)` | `cublasGemmEx` | alpha from RHS |
|
||||
| `C += A * B` | `cublasGemmEx` | alpha=1, beta=1 |
|
||||
| `C.device(ctx) -= A * B` | `cublasGemmEx` | alpha=-1, beta=1 |
|
||||
| `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs |
|
||||
| `X = A.llt<Upper>().solve(B)` | same | uplo=Upper |
|
||||
| `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs |
|
||||
| `X = A.triangularView<L>().solve(B)` | `cublasXtrsm` | side=L, uplo, diag=NonUnit |
|
||||
| `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
|
||||
| `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
|
||||
|
||||
### `DeviceMatrix<Scalar>` API
|
||||
|
||||
| Method | Sync? | Description |
|
||||
|--------|-------|-------------|
|
||||
| `DeviceMatrix()` | -- | Empty (0x0) |
|
||||
| `DeviceMatrix(rows, cols)` | -- | Allocate uninitialized |
|
||||
| `fromHost(matrix, stream)` | yes | Upload from Eigen matrix |
|
||||
| `fromHostAsync(ptr, rows, cols, outerStride, stream)` | no | Async upload (caller manages lifetime) |
|
||||
| `toHost(stream)` | yes | Synchronous download |
|
||||
| `toHostAsync(stream)` | no | Returns `HostTransfer` future |
|
||||
| `clone(stream)` | no | Device-to-device deep copy |
|
||||
| `resize(rows, cols)` | -- | Discard contents, reallocate |
|
||||
| `data()` | -- | Raw device pointer |
|
||||
| `rows()`, `cols()` | -- | Dimensions |
|
||||
| `sizeInBytes()` | -- | Total device allocation size in bytes |
|
||||
| `empty()` | -- | True if 0x0 |
|
||||
| `adjoint()` | -- | Adjoint view (GEMM ConjTrans) |
|
||||
| `transpose()` | -- | Transpose view (GEMM Trans) |
|
||||
| `llt()` / `llt<UpLo>()` | -- | Cholesky expression builder |
|
||||
| `lu()` | -- | LU expression builder |
|
||||
| `triangularView<UpLo>()` | -- | Triangular view (TRSM) |
|
||||
| `selfadjointView<UpLo>()` | -- | Self-adjoint view (SYMM, rankUpdate) |
|
||||
| `device(ctx)` | -- | Assignment proxy bound to context |
|
||||
|
||||
### `GpuContext`
|
||||
|
||||
Unified GPU execution context owning a CUDA stream and library handles.
|
||||
|
||||
```cpp
|
||||
GpuContext() // Creates dedicated stream + handles
|
||||
static GpuContext& threadLocal() // Per-thread default (lazy-created)
|
||||
|
||||
cudaStream_t stream()
|
||||
cublasHandle_t cublasHandle()
|
||||
cusolverDnHandle_t cusolverHandle()
|
||||
```
|
||||
|
||||
Non-copyable, non-movable (owns library handles).
|
||||
|
||||
### `GpuLLT<Scalar, UpLo>` API
|
||||
|
||||
GPU dense Cholesky (LL^T) via cuSOLVER. Caches factor on device.
|
||||
|
||||
| Method | Sync? | Description |
|
||||
|--------|-------|-------------|
|
||||
| `GpuLLT(A)` | deferred | Construct and factorize from host matrix |
|
||||
| `compute(host_matrix)` | deferred | Upload and factorize |
|
||||
| `compute(DeviceMatrix)` | deferred | D2D copy and factorize |
|
||||
| `compute(DeviceMatrix&&)` | deferred | Move-adopt and factorize (no copy) |
|
||||
| `solve(host_matrix)` | yes | Solve, return host matrix |
|
||||
| `solve(DeviceMatrix)` | no | Solve, return `DeviceMatrix` (async) |
|
||||
| `info()` | lazy | Syncs stream on first call, returns `Success` or `NumericalIssue` |
|
||||
|
||||
### `GpuLU<Scalar>` API
|
||||
|
||||
GPU dense partial-pivoting LU via cuSOLVER. Same pattern as `GpuLLT`, plus
|
||||
`TransposeMode` parameter on `solve()` (`NoTranspose`, `Transpose`,
|
||||
`ConjugateTranspose`).
|
||||
|
||||
### `HostTransfer<Scalar>` API
|
||||
|
||||
Future for async device-to-host transfer.
|
||||
|
||||
| Method | Description |
|
||||
|--------|-------------|
|
||||
| `get()` | Block until transfer completes, return host matrix reference. Idempotent. |
|
||||
| `ready()` | Non-blocking poll |
|
||||
|
||||
### Aliasing
|
||||
|
||||
Unlike Eigen's `Matrix`, where omitting `.noalias()` triggers a copy to a
|
||||
temporary, DeviceMatrix dispatches directly to NVIDIA library calls which have
|
||||
no built-in aliasing protection. All operations are implicitly noalias.
|
||||
The caller must ensure operands don't alias the destination for GEMM and TRSM
|
||||
(debug asserts catch violations).
|
||||
|
||||
## File layout
|
||||
|
||||
| File | Depends on | Contents |
|
||||
|------|-----------|----------|
|
||||
| `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `cuda_data_type<>` |
|
||||
| `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` |
|
||||
| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM expression wrappers |
|
||||
| `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers |
|
||||
| `DeviceSolverExpr.h` | `DeviceMatrix.h` | Solver expression wrappers (LLT, LU) |
|
||||
| `DeviceDispatch.h` | all above | All dispatch functions + `DeviceAssignment` |
|
||||
| `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `GpuContext` |
|
||||
| `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>` | cuBLAS error macro, op/compute type maps |
|
||||
| `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
|
||||
| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
|
||||
| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
|
||||
|
||||
## Building and testing
|
||||
|
||||
```bash
|
||||
cmake -G Ninja -B build -S . \
|
||||
-DEIGEN_TEST_CUDA=ON \
|
||||
-DEIGEN_CUDA_COMPUTE_ARCH="70" \
|
||||
-DEIGEN_TEST_CUBLAS=ON \
|
||||
-DEIGEN_TEST_CUSOLVER=ON
|
||||
|
||||
cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu gpu_device_matrix
|
||||
ctest --test-dir build -R "gpu_cublas|gpu_cusolver|gpu_device" --output-on-failure
|
||||
```
|
||||
@@ -43,3 +43,10 @@ add_subdirectory(Householder)
|
||||
add_subdirectory(Solvers)
|
||||
add_subdirectory(Tuning)
|
||||
add_subdirectory(BLAS)
|
||||
|
||||
# GPU benchmarks have their own CMake project (needs CUDAToolkit).
|
||||
# They can also be built standalone: cmake -B build -S benchmarks/GPU
|
||||
find_package(CUDAToolkit QUIET)
|
||||
if(CUDAToolkit_FOUND)
|
||||
add_subdirectory(GPU)
|
||||
endif()
|
||||
|
||||
53
benchmarks/GPU/CMakeLists.txt
Normal file
53
benchmarks/GPU/CMakeLists.txt
Normal file
@@ -0,0 +1,53 @@
|
||||
# GPU benchmarks require CUDA runtime + cuSOLVER.
|
||||
# Build separately from the main benchmark tree since they need CUDA toolchain.
|
||||
#
|
||||
# Usage:
|
||||
# cmake -G Ninja -B build-bench-gpu -S benchmarks/GPU \
|
||||
# -DCMAKE_CUDA_ARCHITECTURES=89
|
||||
# cmake --build build-bench-gpu
|
||||
#
|
||||
# Profiling:
|
||||
# nsys profile --trace=cuda ./build-bench-gpu/bench_gpu_solvers
|
||||
# ncu --set full -o profile ./build-bench-gpu/bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096
|
||||
|
||||
cmake_minimum_required(VERSION 3.18)
|
||||
project(EigenGpuBenchmarks CXX)
|
||||
|
||||
find_package(benchmark REQUIRED)
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
|
||||
set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")
|
||||
|
||||
function(eigen_add_gpu_benchmark name source)
|
||||
cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN})
|
||||
if(NOT IS_ABSOLUTE "${source}")
|
||||
set(source "${CMAKE_CURRENT_SOURCE_DIR}/${source}")
|
||||
endif()
|
||||
add_executable(${name} ${source})
|
||||
target_include_directories(${name} PRIVATE
|
||||
${EIGEN_SOURCE_DIR}
|
||||
${CUDAToolkit_INCLUDE_DIRS})
|
||||
target_link_libraries(${name} PRIVATE
|
||||
benchmark::benchmark benchmark::benchmark_main
|
||||
CUDA::cudart CUDA::cusolver CUDA::cublas)
|
||||
if(BENCH_LIBRARIES)
|
||||
target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES})
|
||||
endif()
|
||||
target_compile_options(${name} PRIVATE -O3 -DNDEBUG)
|
||||
target_compile_definitions(${name} PRIVATE EIGEN_USE_GPU)
|
||||
if(BENCH_DEFINITIONS)
|
||||
target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS})
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
# Solver benchmarks: LLT/LU compute + solve, host vs device paths, CPU baselines.
|
||||
eigen_add_gpu_benchmark(bench_gpu_solvers bench_gpu_solvers.cpp)
|
||||
eigen_add_gpu_benchmark(bench_gpu_solvers_float bench_gpu_solvers.cpp DEFINITIONS SCALAR=float)
|
||||
|
||||
# Chaining benchmarks: async pipeline efficiency, host-roundtrip vs device chain.
|
||||
eigen_add_gpu_benchmark(bench_gpu_chaining bench_gpu_chaining.cpp)
|
||||
eigen_add_gpu_benchmark(bench_gpu_chaining_float bench_gpu_chaining.cpp DEFINITIONS SCALAR=float)
|
||||
|
||||
# Batching benchmarks: multi-stream concurrency for many small systems.
|
||||
eigen_add_gpu_benchmark(bench_gpu_batching bench_gpu_batching.cpp)
|
||||
eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITIONS SCALAR=float)
|
||||
268
benchmarks/GPU/bench_gpu_batching.cpp
Normal file
268
benchmarks/GPU/bench_gpu_batching.cpp
Normal file
@@ -0,0 +1,268 @@
|
||||
// GPU batching benchmarks: multi-stream concurrency for many small solves.
|
||||
//
|
||||
// Each GpuLLT/GpuLU owns its own CUDA stream. This benchmark measures how
|
||||
// well multiple solver instances overlap on the GPU, which is critical for
|
||||
// workloads like robotics (many small systems) and SLAM (batched poses).
|
||||
//
|
||||
// Compares:
|
||||
// 1. Sequential: one solver handles all systems one by one
|
||||
// 2. Batched: N solvers on N streams, all launched before any sync
|
||||
// 3. CPU baseline: Eigen LLT on host
|
||||
//
|
||||
// For Nsight Systems: batched mode should show overlapping kernels on
|
||||
// different streams in the timeline view.
|
||||
//
|
||||
// nsys profile --trace=cuda ./bench_gpu_batching
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
|
||||
#include <Eigen/Cholesky>
|
||||
#include <Eigen/GPU>
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
#ifndef SCALAR
|
||||
#define SCALAR double
|
||||
#endif
|
||||
|
||||
using Scalar = SCALAR;
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
|
||||
static Mat make_spd(Index n) {
|
||||
Mat M = Mat::Random(n, n);
|
||||
return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
|
||||
}
|
||||
|
||||
static void cuda_warmup() {
|
||||
static bool done = false;
|
||||
if (!done) {
|
||||
void* p;
|
||||
cudaMalloc(&p, 1);
|
||||
cudaFree(p);
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Sequential: one solver, N systems solved one after another
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
static void BM_Batch_Sequential(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const int batch_size = static_cast<int>(state.range(1));
|
||||
|
||||
// Pre-generate all SPD matrices and RHS vectors.
|
||||
std::vector<Mat> As(batch_size);
|
||||
std::vector<Mat> Bs(batch_size);
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
As[i] = make_spd(n);
|
||||
Bs[i] = Mat::Random(n, 1);
|
||||
}
|
||||
|
||||
GpuLLT<Scalar> llt;
|
||||
|
||||
for (auto _ : state) {
|
||||
std::vector<Mat> results(batch_size);
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
llt.compute(As[i]);
|
||||
results[i] = llt.solve(Bs[i]);
|
||||
}
|
||||
benchmark::DoNotOptimize(results.back().data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["batch"] = batch_size;
|
||||
state.counters["total_solves"] = batch_size;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Sequential with DeviceMatrix (avoid re-upload of A each iteration)
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
static void BM_Batch_Sequential_Device(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const int batch_size = static_cast<int>(state.range(1));
|
||||
|
||||
std::vector<Mat> As(batch_size);
|
||||
std::vector<Mat> Bs(batch_size);
|
||||
std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
|
||||
std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
As[i] = make_spd(n);
|
||||
Bs[i] = Mat::Random(n, 1);
|
||||
d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
|
||||
d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
|
||||
}
|
||||
|
||||
GpuLLT<Scalar> llt;
|
||||
|
||||
for (auto _ : state) {
|
||||
std::vector<Mat> results(batch_size);
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
llt.compute(d_As[i]);
|
||||
DeviceMatrix<Scalar> d_X = llt.solve(d_Bs[i]);
|
||||
results[i] = d_X.toHost();
|
||||
}
|
||||
benchmark::DoNotOptimize(results.back().data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["batch"] = batch_size;
|
||||
state.counters["total_solves"] = batch_size;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Batched: N solvers on N streams, overlapping execution
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
static void BM_Batch_MultiStream(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const int batch_size = static_cast<int>(state.range(1));
|
||||
|
||||
std::vector<Mat> As(batch_size);
|
||||
std::vector<Mat> Bs(batch_size);
|
||||
std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
|
||||
std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
As[i] = make_spd(n);
|
||||
Bs[i] = Mat::Random(n, 1);
|
||||
d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
|
||||
d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
|
||||
}
|
||||
|
||||
// N solvers = N independent CUDA streams.
|
||||
std::vector<std::unique_ptr<GpuLLT<Scalar>>> solvers(batch_size);
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
solvers[i] = std::make_unique<GpuLLT<Scalar>>();
|
||||
}
|
||||
|
||||
for (auto _ : state) {
|
||||
// Phase 1: launch all factorizations (async, different streams).
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
solvers[i]->compute(d_As[i]);
|
||||
}
|
||||
|
||||
// Phase 2: launch all solves (async, different streams).
|
||||
std::vector<DeviceMatrix<Scalar>> d_Xs(batch_size);
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
d_Xs[i] = solvers[i]->solve(d_Bs[i]);
|
||||
}
|
||||
|
||||
// Phase 3: download all results.
|
||||
std::vector<Mat> results(batch_size);
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
results[i] = d_Xs[i].toHost();
|
||||
}
|
||||
benchmark::DoNotOptimize(results.back().data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["batch"] = batch_size;
|
||||
state.counters["streams"] = batch_size;
|
||||
state.counters["total_solves"] = batch_size;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Batched with async download (overlap D2H with computation)
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
static void BM_Batch_MultiStream_AsyncDownload(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const int batch_size = static_cast<int>(state.range(1));
|
||||
|
||||
std::vector<Mat> As(batch_size);
|
||||
std::vector<Mat> Bs(batch_size);
|
||||
std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
|
||||
std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
As[i] = make_spd(n);
|
||||
Bs[i] = Mat::Random(n, 1);
|
||||
d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
|
||||
d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
|
||||
}
|
||||
|
||||
std::vector<std::unique_ptr<GpuLLT<Scalar>>> solvers(batch_size);
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
solvers[i] = std::make_unique<GpuLLT<Scalar>>();
|
||||
}
|
||||
|
||||
for (auto _ : state) {
|
||||
// Launch all compute + solve.
|
||||
std::vector<DeviceMatrix<Scalar>> d_Xs(batch_size);
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
solvers[i]->compute(d_As[i]);
|
||||
d_Xs[i] = solvers[i]->solve(d_Bs[i]);
|
||||
}
|
||||
|
||||
// Enqueue all async downloads.
|
||||
std::vector<HostTransfer<Scalar>> transfers;
|
||||
transfers.reserve(batch_size);
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
transfers.push_back(d_Xs[i].toHostAsync());
|
||||
}
|
||||
|
||||
// Collect all results.
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
benchmark::DoNotOptimize(transfers[i].get().data());
|
||||
}
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["batch"] = batch_size;
|
||||
state.counters["streams"] = batch_size;
|
||||
state.counters["total_solves"] = batch_size;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// CPU baseline: Eigen LLT on host, sequential
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
static void BM_Batch_CPU(benchmark::State& state) {
|
||||
const Index n = state.range(0);
|
||||
const int batch_size = static_cast<int>(state.range(1));
|
||||
|
||||
std::vector<Mat> As(batch_size);
|
||||
std::vector<Mat> Bs(batch_size);
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
As[i] = make_spd(n);
|
||||
Bs[i] = Mat::Random(n, 1);
|
||||
}
|
||||
|
||||
for (auto _ : state) {
|
||||
std::vector<Mat> results(batch_size);
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
LLT<Mat> llt(As[i]);
|
||||
results[i] = llt.solve(Bs[i]);
|
||||
}
|
||||
benchmark::DoNotOptimize(results.back().data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["batch"] = batch_size;
|
||||
state.counters["total_solves"] = batch_size;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Registration
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
// clang-format off
|
||||
// Args: {matrix_size, batch_size}
|
||||
// Small matrices with large batches are the interesting case for multi-stream.
|
||||
BENCHMARK(BM_Batch_Sequential)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_Batch_Sequential_Device)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_Batch_CPU)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
|
||||
|
||||
// Also run larger sizes with moderate batching.
|
||||
BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond);
|
||||
// clang-format on
|
||||
216
benchmarks/GPU/bench_gpu_chaining.cpp
Normal file
216
benchmarks/GPU/bench_gpu_chaining.cpp
Normal file
@@ -0,0 +1,216 @@
|
||||
// GPU chaining benchmarks: measure async pipeline efficiency.
|
||||
//
|
||||
// Compares:
|
||||
// 1. Host round-trip per solve (baseline)
|
||||
// 2. DeviceMatrix chaining (no host round-trip between solves)
|
||||
// 3. Varying chain lengths (1, 2, 4, 8 consecutive solves)
|
||||
//
|
||||
// For Nsight Systems: look for gaps between kernel launches in the timeline.
|
||||
// Host round-trip creates visible idle gaps; chaining should show back-to-back kernels.
|
||||
//
|
||||
// nsys profile --trace=cuda,nvtx ./bench_gpu_chaining
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
|
||||
#include <Eigen/Cholesky>
|
||||
#include <Eigen/GPU>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
#ifndef SCALAR
|
||||
#define SCALAR double
|
||||
#endif
|
||||
|
||||
using Scalar = SCALAR;
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
|
||||
static Mat make_spd(Index n) {
|
||||
Mat M = Mat::Random(n, n);
|
||||
return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
|
||||
}
|
||||
|
||||
static void cuda_warmup() {
|
||||
static bool done = false;
|
||||
if (!done) {
|
||||
void* p;
|
||||
cudaMalloc(&p, 1);
|
||||
cudaFree(p);
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Baseline: host round-trip between every solve
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
static void BM_Chain_HostRoundtrip(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const int chain_len = static_cast<int>(state.range(1));
|
||||
|
||||
Mat A = make_spd(n);
|
||||
Mat B = Mat::Random(n, 1);
|
||||
GpuLLT<Scalar> llt(A);
|
||||
|
||||
for (auto _ : state) {
|
||||
Mat X = B;
|
||||
for (int i = 0; i < chain_len; ++i) {
|
||||
X = llt.solve(X); // host → device → host each time
|
||||
}
|
||||
benchmark::DoNotOptimize(X.data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["chain"] = chain_len;
|
||||
state.counters["solves/iter"] = chain_len;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// DeviceMatrix chaining: no host round-trip between solves
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
static void BM_Chain_Device(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const int chain_len = static_cast<int>(state.range(1));
|
||||
|
||||
Mat A = make_spd(n);
|
||||
Mat B = Mat::Random(n, 1);
|
||||
GpuLLT<Scalar> llt(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
for (auto _ : state) {
|
||||
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
|
||||
for (int i = 1; i < chain_len; ++i) {
|
||||
d_X = llt.solve(d_X); // device → device, fully async
|
||||
}
|
||||
Mat X = d_X.toHost(); // single sync at end
|
||||
benchmark::DoNotOptimize(X.data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["chain"] = chain_len;
|
||||
state.counters["solves/iter"] = chain_len;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// DeviceMatrix chaining with async download (overlap D2H with next iteration)
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
static void BM_Chain_DeviceAsync(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const int chain_len = static_cast<int>(state.range(1));
|
||||
|
||||
Mat A = make_spd(n);
|
||||
Mat B = Mat::Random(n, 1);
|
||||
GpuLLT<Scalar> llt(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
for (auto _ : state) {
|
||||
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
|
||||
for (int i = 1; i < chain_len; ++i) {
|
||||
d_X = llt.solve(d_X);
|
||||
}
|
||||
auto transfer = d_X.toHostAsync();
|
||||
Mat X = transfer.get();
|
||||
benchmark::DoNotOptimize(X.data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["chain"] = chain_len;
|
||||
state.counters["solves/iter"] = chain_len;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Pure GPU chain (no download — measures kernel-only throughput)
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
static void BM_Chain_DeviceNoDownload(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const int chain_len = static_cast<int>(state.range(1));
|
||||
|
||||
Mat A = make_spd(n);
|
||||
Mat B = Mat::Random(n, 1);
|
||||
GpuLLT<Scalar> llt(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
for (auto _ : state) {
|
||||
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
|
||||
for (int i = 1; i < chain_len; ++i) {
|
||||
d_X = llt.solve(d_X);
|
||||
}
|
||||
cudaStreamSynchronize(llt.stream());
|
||||
benchmark::DoNotOptimize(d_X.data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["chain"] = chain_len;
|
||||
state.counters["solves/iter"] = chain_len;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Compute + solve chain (full pipeline: factorize, then chain solves)
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
static void BM_FullPipeline_Host(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const int chain_len = static_cast<int>(state.range(1));
|
||||
|
||||
Mat A = make_spd(n);
|
||||
Mat B = Mat::Random(n, 1);
|
||||
|
||||
for (auto _ : state) {
|
||||
GpuLLT<Scalar> llt(A);
|
||||
Mat X = B;
|
||||
for (int i = 0; i < chain_len; ++i) {
|
||||
X = llt.solve(X);
|
||||
}
|
||||
benchmark::DoNotOptimize(X.data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["chain"] = chain_len;
|
||||
}
|
||||
|
||||
static void BM_FullPipeline_Device(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const int chain_len = static_cast<int>(state.range(1));
|
||||
|
||||
Mat A = make_spd(n);
|
||||
Mat B = Mat::Random(n, 1);
|
||||
|
||||
for (auto _ : state) {
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
GpuLLT<Scalar> llt;
|
||||
llt.compute(d_A);
|
||||
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
|
||||
for (int i = 1; i < chain_len; ++i) {
|
||||
d_X = llt.solve(d_X);
|
||||
}
|
||||
Mat X = d_X.toHost();
|
||||
benchmark::DoNotOptimize(X.data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["chain"] = chain_len;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Registration
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
// clang-format off
|
||||
// Args: {matrix_size, chain_length}
|
||||
BENCHMARK(BM_Chain_HostRoundtrip)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_Chain_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_Chain_DeviceAsync)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_Chain_DeviceNoDownload)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
|
||||
|
||||
BENCHMARK(BM_FullPipeline_Host)->ArgsProduct({{256, 1024, 4096}, {1, 4}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_FullPipeline_Device)->ArgsProduct({{256, 1024, 4096}, {1, 4}})->Unit(benchmark::kMicrosecond);
|
||||
// clang-format on
|
||||
296
benchmarks/GPU/bench_gpu_solvers.cpp
Normal file
296
benchmarks/GPU/bench_gpu_solvers.cpp
Normal file
@@ -0,0 +1,296 @@
|
||||
// GPU solver benchmarks: GpuLLT and GpuLU compute + solve throughput.
|
||||
//
|
||||
// Measures factorization and solve performance for the host-matrix and
|
||||
// DeviceMatrix code paths across a range of matrix sizes.
|
||||
//
|
||||
// For Nsight Systems profiling:
|
||||
// nsys profile --trace=cuda,nvtx ./bench_gpu_solvers
|
||||
//
|
||||
// For Nsight Compute kernel analysis:
|
||||
// ncu --set full -o profile ./bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096
|
||||
|
||||
#include <benchmark/benchmark.h>
|
||||
|
||||
#include <Eigen/Cholesky>
|
||||
#include <Eigen/GPU>
|
||||
#include <Eigen/LU>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
#ifndef SCALAR
|
||||
#define SCALAR double
|
||||
#endif
|
||||
|
||||
using Scalar = SCALAR;
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
static Mat make_spd(Index n) {
|
||||
Mat M = Mat::Random(n, n);
|
||||
return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
|
||||
}
|
||||
|
||||
// CUDA warm-up: ensure the GPU is initialized before timing.
|
||||
static void cuda_warmup() {
|
||||
static bool done = false;
|
||||
if (!done) {
|
||||
void* p;
|
||||
cudaMalloc(&p, 1);
|
||||
cudaFree(p);
|
||||
done = true;
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// GpuLLT benchmarks
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
// Factorize from host matrix (includes H2D upload).
|
||||
static void BM_GpuLLT_Compute_Host(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
Mat A = make_spd(n);
|
||||
GpuLLT<Scalar> llt;
|
||||
|
||||
for (auto _ : state) {
|
||||
llt.compute(A);
|
||||
if (llt.info() != Success) state.SkipWithError("factorization failed");
|
||||
}
|
||||
|
||||
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
|
||||
state.counters["GFLOPS"] =
|
||||
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||||
state.counters["n"] = n;
|
||||
}
|
||||
|
||||
// Factorize from DeviceMatrix (D2D copy path).
|
||||
static void BM_GpuLLT_Compute_Device(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
Mat A = make_spd(n);
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
GpuLLT<Scalar> llt;
|
||||
|
||||
for (auto _ : state) {
|
||||
llt.compute(d_A);
|
||||
if (llt.info() != Success) state.SkipWithError("factorization failed");
|
||||
}
|
||||
|
||||
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
|
||||
state.counters["GFLOPS"] =
|
||||
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||||
state.counters["n"] = n;
|
||||
}
|
||||
|
||||
// Factorize from DeviceMatrix (move path, no copy).
|
||||
static void BM_GpuLLT_Compute_DeviceMove(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
Mat A = make_spd(n);
|
||||
GpuLLT<Scalar> llt;
|
||||
|
||||
for (auto _ : state) {
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
llt.compute(std::move(d_A));
|
||||
if (llt.info() != Success) state.SkipWithError("factorization failed");
|
||||
}
|
||||
|
||||
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
|
||||
state.counters["GFLOPS"] =
|
||||
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||||
state.counters["n"] = n;
|
||||
}
|
||||
|
||||
// Solve from host matrix (H2D + potrs + D2H).
|
||||
static void BM_GpuLLT_Solve_Host(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const Index nrhs = state.range(1);
|
||||
Mat A = make_spd(n);
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
GpuLLT<Scalar> llt(A);
|
||||
|
||||
for (auto _ : state) {
|
||||
Mat X = llt.solve(B);
|
||||
benchmark::DoNotOptimize(X.data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["nrhs"] = nrhs;
|
||||
}
|
||||
|
||||
// Solve from DeviceMatrix (D2D + potrs, async, toHost at end).
|
||||
static void BM_GpuLLT_Solve_Device(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const Index nrhs = state.range(1);
|
||||
Mat A = make_spd(n);
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
GpuLLT<Scalar> llt(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
for (auto _ : state) {
|
||||
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
|
||||
Mat X = d_X.toHost();
|
||||
benchmark::DoNotOptimize(X.data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["nrhs"] = nrhs;
|
||||
}
|
||||
|
||||
// Solve staying entirely on device (no toHost — measures pure GPU time).
|
||||
static void BM_GpuLLT_Solve_DeviceOnly(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const Index nrhs = state.range(1);
|
||||
Mat A = make_spd(n);
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
GpuLLT<Scalar> llt(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
for (auto _ : state) {
|
||||
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
|
||||
// Force completion without D2H transfer.
|
||||
cudaStreamSynchronize(llt.stream());
|
||||
benchmark::DoNotOptimize(d_X.data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["nrhs"] = nrhs;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// GpuLU benchmarks
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
static void BM_GpuLU_Compute_Host(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
Mat A = Mat::Random(n, n);
|
||||
GpuLU<Scalar> lu;
|
||||
|
||||
for (auto _ : state) {
|
||||
lu.compute(A);
|
||||
if (lu.info() != Success) state.SkipWithError("factorization failed");
|
||||
}
|
||||
|
||||
double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
|
||||
state.counters["GFLOPS"] =
|
||||
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||||
state.counters["n"] = n;
|
||||
}
|
||||
|
||||
static void BM_GpuLU_Compute_Device(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
Mat A = Mat::Random(n, n);
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
GpuLU<Scalar> lu;
|
||||
|
||||
for (auto _ : state) {
|
||||
lu.compute(d_A);
|
||||
if (lu.info() != Success) state.SkipWithError("factorization failed");
|
||||
}
|
||||
|
||||
double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
|
||||
state.counters["GFLOPS"] =
|
||||
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||||
state.counters["n"] = n;
|
||||
}
|
||||
|
||||
static void BM_GpuLU_Solve_Host(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const Index nrhs = state.range(1);
|
||||
Mat A = Mat::Random(n, n);
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
GpuLU<Scalar> lu(A);
|
||||
|
||||
for (auto _ : state) {
|
||||
Mat X = lu.solve(B);
|
||||
benchmark::DoNotOptimize(X.data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["nrhs"] = nrhs;
|
||||
}
|
||||
|
||||
static void BM_GpuLU_Solve_Device(benchmark::State& state) {
|
||||
cuda_warmup();
|
||||
const Index n = state.range(0);
|
||||
const Index nrhs = state.range(1);
|
||||
Mat A = Mat::Random(n, n);
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
GpuLU<Scalar> lu(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
for (auto _ : state) {
|
||||
DeviceMatrix<Scalar> d_X = lu.solve(d_B);
|
||||
Mat X = d_X.toHost();
|
||||
benchmark::DoNotOptimize(X.data());
|
||||
}
|
||||
|
||||
state.counters["n"] = n;
|
||||
state.counters["nrhs"] = nrhs;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// CPU baselines for comparison
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
static void BM_CpuLLT_Compute(benchmark::State& state) {
|
||||
const Index n = state.range(0);
|
||||
Mat A = make_spd(n);
|
||||
LLT<Mat> llt;
|
||||
|
||||
for (auto _ : state) {
|
||||
llt.compute(A);
|
||||
benchmark::DoNotOptimize(llt.matrixLLT().data());
|
||||
}
|
||||
|
||||
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
|
||||
state.counters["GFLOPS"] =
|
||||
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||||
state.counters["n"] = n;
|
||||
}
|
||||
|
||||
static void BM_CpuLU_Compute(benchmark::State& state) {
|
||||
const Index n = state.range(0);
|
||||
Mat A = Mat::Random(n, n);
|
||||
PartialPivLU<Mat> lu;
|
||||
|
||||
for (auto _ : state) {
|
||||
lu.compute(A);
|
||||
benchmark::DoNotOptimize(lu.matrixLU().data());
|
||||
}
|
||||
|
||||
double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
|
||||
state.counters["GFLOPS"] =
|
||||
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||||
state.counters["n"] = n;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Registration
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
// clang-format off
|
||||
BENCHMARK(BM_GpuLLT_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_GpuLLT_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_GpuLLT_Compute_DeviceMove)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_GpuLLT_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_GpuLLT_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_GpuLLT_Solve_DeviceOnly)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
|
||||
|
||||
BENCHMARK(BM_GpuLU_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_GpuLU_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_GpuLU_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_GpuLU_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
|
||||
|
||||
BENCHMARK(BM_CpuLLT_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
|
||||
BENCHMARK(BM_CpuLU_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
|
||||
// clang-format on
|
||||
@@ -211,20 +211,20 @@ build:linux:x86-64:nvhpc-26.1:default:unsupported:
|
||||
# Build on regular linux to limit GPU cost.
|
||||
- saas-linux-2xlarge-amd64
|
||||
|
||||
# GCC-10, CUDA-12.2
|
||||
build:linux:cuda-12.2:gcc-10:
|
||||
# GCC-11, CUDA-12.2
|
||||
build:linux:cuda-12.2:gcc-11:
|
||||
extends: .build:linux:cuda
|
||||
image: nvidia/cuda:12.2.0-devel-ubuntu20.04
|
||||
image: nvidia/cuda:12.2.0-devel-ubuntu22.04
|
||||
variables:
|
||||
EIGEN_CI_C_COMPILER: gcc-10
|
||||
EIGEN_CI_CXX_COMPILER: g++-10
|
||||
EIGEN_CI_C_COMPILER: gcc-11
|
||||
EIGEN_CI_CXX_COMPILER: g++-11
|
||||
|
||||
# Clang-12, CUDA-12.2
|
||||
build:linux:cuda-12.2:clang-12:
|
||||
extends: build:linux:cuda-12.2:gcc-10
|
||||
# Clang-14, CUDA-12.2
|
||||
build:linux:cuda-12.2:clang-14:
|
||||
extends: build:linux:cuda-12.2:gcc-11
|
||||
variables:
|
||||
EIGEN_CI_C_COMPILER: clang-12
|
||||
EIGEN_CI_CXX_COMPILER: clang++-12
|
||||
EIGEN_CI_C_COMPILER: clang-14
|
||||
EIGEN_CI_CXX_COMPILER: clang++-14
|
||||
EIGEN_CI_TEST_CUDA_CLANG: "on"
|
||||
|
||||
|
||||
|
||||
@@ -265,23 +265,23 @@ test:linux:x86-64:nvhpc-26.1:default:unsupported:
|
||||
tags:
|
||||
- saas-linux-medium-amd64-gpu-standard
|
||||
|
||||
# GCC-10, CUDA-12.2
|
||||
test:linux:cuda-12.2:gcc-10:
|
||||
# GCC-11, CUDA-12.2
|
||||
test:linux:cuda-12.2:gcc-11:
|
||||
extends: .test:linux:cuda
|
||||
image: nvidia/cuda:12.2.0-devel-ubuntu20.04
|
||||
needs: [ build:linux:cuda-12.2:gcc-10 ]
|
||||
image: nvidia/cuda:12.2.0-devel-ubuntu22.04
|
||||
needs: [ build:linux:cuda-12.2:gcc-11 ]
|
||||
variables:
|
||||
EIGEN_CI_CXX_COMPILER: g++-10
|
||||
EIGEN_CI_CC_COMPILER: gcc-10
|
||||
EIGEN_CI_CXX_COMPILER: g++-11
|
||||
EIGEN_CI_CC_COMPILER: gcc-11
|
||||
|
||||
# Clang-12, CUDA-12.2
|
||||
test:linux:cuda-12.2:clang-12:
|
||||
# Clang-14, CUDA-12.2
|
||||
test:linux:cuda-12.2:clang-14:
|
||||
extends: .test:linux:cuda
|
||||
image: nvidia/cuda:12.2.0-devel-ubuntu20.04
|
||||
needs: [ build:linux:cuda-12.2:clang-12 ]
|
||||
image: nvidia/cuda:12.2.0-devel-ubuntu22.04
|
||||
needs: [ build:linux:cuda-12.2:clang-14 ]
|
||||
variables:
|
||||
EIGEN_CI_CXX_COMPILER: clang++-12
|
||||
EIGEN_CI_CC_COMPILER: clang-12
|
||||
EIGEN_CI_CXX_COMPILER: clang++-14
|
||||
EIGEN_CI_CC_COMPILER: clang-14
|
||||
|
||||
|
||||
##### arm ######################################################################
|
||||
|
||||
@@ -479,6 +479,78 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
|
||||
|
||||
ei_add_test(gpu_example)
|
||||
ei_add_test(gpu_basic)
|
||||
ei_add_test(gpu_library_example "" "CUDA::cusolver")
|
||||
|
||||
# DeviceMatrix tests: only CUDA runtime, no NVIDIA libraries.
|
||||
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||
add_executable(gpu_device_matrix gpu_device_matrix.cpp)
|
||||
target_include_directories(gpu_device_matrix PRIVATE
|
||||
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}")
|
||||
target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart)
|
||||
target_compile_definitions(gpu_device_matrix PRIVATE
|
||||
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
|
||||
EIGEN_TEST_PART_ALL=1)
|
||||
add_test(NAME gpu_device_matrix COMMAND gpu_device_matrix)
|
||||
add_dependencies(buildtests gpu_device_matrix)
|
||||
add_dependencies(buildtests_gpu gpu_device_matrix)
|
||||
set_property(TEST gpu_device_matrix APPEND PROPERTY LABELS "Official;gpu")
|
||||
set_property(TEST gpu_device_matrix PROPERTY SKIP_RETURN_CODE 77)
|
||||
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||
|
||||
# Library-specific GPU tests (activated by later phases, OFF by default).
|
||||
# CUDAToolkit imported targets (CUDA::cublas, etc.) are available from
|
||||
# find_package(CUDAToolkit) above.
|
||||
option(EIGEN_TEST_CUBLAS "Test cuBLAS integration" OFF)
|
||||
if(EIGEN_TEST_CUBLAS AND TARGET CUDA::cublas)
|
||||
# cuBLAS tests are plain .cpp files (no device code), like cuSOLVER tests.
|
||||
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||
add_executable(gpu_cublas gpu_cublas.cpp)
|
||||
target_include_directories(gpu_cublas PRIVATE
|
||||
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}")
|
||||
target_link_libraries(gpu_cublas
|
||||
Eigen3::Eigen CUDA::cudart CUDA::cublas CUDA::cusolver)
|
||||
target_compile_definitions(gpu_cublas PRIVATE
|
||||
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
|
||||
EIGEN_TEST_PART_ALL=1)
|
||||
add_test(NAME gpu_cublas COMMAND gpu_cublas)
|
||||
add_dependencies(buildtests gpu_cublas)
|
||||
add_dependencies(buildtests_gpu gpu_cublas)
|
||||
set_property(TEST gpu_cublas APPEND PROPERTY LABELS "Official;gpu")
|
||||
set_property(TEST gpu_cublas PROPERTY SKIP_RETURN_CODE 77)
|
||||
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||
endif()
|
||||
|
||||
option(EIGEN_TEST_CUSOLVER "Test cuSOLVER integration" OFF)
|
||||
if(EIGEN_TEST_CUSOLVER AND TARGET CUDA::cusolver)
|
||||
# cuSOLVER tests are plain .cpp files: no device code, compiled by the host
|
||||
# compiler and linked against CUDA runtime + cuSOLVER. This avoids NVCC
|
||||
# instantiating Eigen's CPU packet operations for CUDA vector types.
|
||||
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||
foreach(_cusolver_test IN ITEMS gpu_cusolver_llt gpu_cusolver_lu)
|
||||
add_executable(${_cusolver_test} ${_cusolver_test}.cpp)
|
||||
target_include_directories(${_cusolver_test} PRIVATE
|
||||
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}")
|
||||
target_link_libraries(${_cusolver_test}
|
||||
Eigen3::Eigen CUDA::cudart CUDA::cusolver CUDA::cublas)
|
||||
target_compile_definitions(${_cusolver_test} PRIVATE
|
||||
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
|
||||
EIGEN_TEST_PART_ALL=1)
|
||||
add_test(NAME ${_cusolver_test} COMMAND "${_cusolver_test}")
|
||||
add_dependencies(buildtests ${_cusolver_test})
|
||||
add_dependencies(buildtests_gpu ${_cusolver_test})
|
||||
set_property(TEST ${_cusolver_test} APPEND PROPERTY LABELS "Official;gpu")
|
||||
set_property(TEST ${_cusolver_test} PROPERTY SKIP_RETURN_CODE 77)
|
||||
endforeach()
|
||||
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||
endif()
|
||||
|
||||
option(EIGEN_TEST_CUSPARSE "Test cuSPARSE integration" OFF)
|
||||
if(EIGEN_TEST_CUSPARSE AND TARGET CUDA::cusparse)
|
||||
ei_add_test(gpu_cusparse "" "CUDA::cusparse")
|
||||
endif()
|
||||
|
||||
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||
|
||||
|
||||
72
test/gpu_context.h
Normal file
72
test/gpu_context.h
Normal file
@@ -0,0 +1,72 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_TEST_GPU_CONTEXT_H
|
||||
#define EIGEN_TEST_GPU_CONTEXT_H
|
||||
|
||||
// RAII context for GPU tests that use NVIDIA library APIs (cuBLAS, cuSOLVER, etc.).
|
||||
// Owns a non-default CUDA stream. Library handles (cuBLAS, cuSOLVER, etc.) are added
|
||||
// here by each integration phase as needed; each handle is bound to the owned stream.
|
||||
//
|
||||
// Usage:
|
||||
// GpuContext ctx;
|
||||
// auto buf = gpu_copy_to_device(ctx.stream, A);
|
||||
// // ... call NVIDIA library APIs using ctx.stream / ctx.cusolver ...
|
||||
// ctx.synchronize();
|
||||
|
||||
#include "gpu_test_helper.h"
|
||||
|
||||
#ifdef EIGEN_USE_GPU
|
||||
#include <cusolverDn.h>
|
||||
|
||||
// Checks cuSOLVER return codes, aborts on failure.
|
||||
#define CUSOLVER_CHECK(expr) \
|
||||
do { \
|
||||
cusolverStatus_t _status = (expr); \
|
||||
if (_status != CUSOLVER_STATUS_SUCCESS) { \
|
||||
printf("cuSOLVER error %d at %s:%d\n", static_cast<int>(_status), __FILE__, __LINE__); \
|
||||
gpu_assert(false); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
struct GpuContext {
|
||||
cudaStream_t stream = nullptr;
|
||||
cusolverDnHandle_t cusolver = nullptr;
|
||||
|
||||
GpuContext() {
|
||||
GPU_CHECK(gpuGetDevice(&device_));
|
||||
GPU_CHECK(gpuGetDeviceProperties(&device_props_, device_));
|
||||
GPU_CHECK(cudaStreamCreate(&stream));
|
||||
CUSOLVER_CHECK(cusolverDnCreate(&cusolver));
|
||||
CUSOLVER_CHECK(cusolverDnSetStream(cusolver, stream));
|
||||
}
|
||||
|
||||
~GpuContext() {
|
||||
if (cusolver) CUSOLVER_CHECK(cusolverDnDestroy(cusolver));
|
||||
if (stream) GPU_CHECK(cudaStreamDestroy(stream));
|
||||
}
|
||||
|
||||
int device() const { return device_; }
|
||||
const gpuDeviceProp_t& deviceProperties() const { return device_props_; }
|
||||
|
||||
// Wait for all work submitted on this context's stream to complete.
|
||||
void synchronize() { GPU_CHECK(cudaStreamSynchronize(stream)); }
|
||||
|
||||
// Non-copyable, non-movable.
|
||||
GpuContext(const GpuContext&) = delete;
|
||||
GpuContext& operator=(const GpuContext&) = delete;
|
||||
|
||||
private:
|
||||
int device_ = 0;
|
||||
gpuDeviceProp_t device_props_;
|
||||
};
|
||||
|
||||
#endif // EIGEN_USE_GPU
|
||||
|
||||
#endif // EIGEN_TEST_GPU_CONTEXT_H
|
||||
728
test/gpu_cublas.cpp
Normal file
728
test/gpu_cublas.cpp
Normal file
@@ -0,0 +1,728 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Tests for cuBLAS GEMM dispatch via DeviceMatrix expression syntax.
|
||||
// Covers: d_C = d_A * d_B, adjoint, transpose, scaled, +=, .device(ctx).
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
#include "main.h"
|
||||
#include <Eigen/GPU>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
// ---- Basic GEMM: C = A * B -------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_basic(Index m, Index n, Index k) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(m, k);
|
||||
Mat B = Mat::Random(k, n);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
// Expression: d_C = d_A * d_B
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
d_C = d_A * d_B;
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A * B;
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- GEMM with adjoint: C = A^H * B ----------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_adjoint_lhs(Index m, Index n, Index k) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(k, m); // A is k×m, A^H is m×k
|
||||
Mat B = Mat::Random(k, n);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
d_C = d_A.adjoint() * d_B;
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A.adjoint() * B;
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- GEMM with transpose: C = A * B^T --------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_transpose_rhs(Index m, Index n, Index k) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(m, k);
|
||||
Mat B = Mat::Random(n, k); // B is n×k, B^T is k×n
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
d_C = d_A * d_B.transpose();
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A * B.transpose();
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- GEMM with scaled: C = alpha * A * B ------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_scaled(Index m, Index n, Index k) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(m, k);
|
||||
Mat B = Mat::Random(k, n);
|
||||
Scalar alpha = Scalar(2.5);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
d_C = alpha * d_A * d_B;
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = alpha * A * B;
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- GEMM accumulate: C += A * B (beta=1) -----------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_accumulate(Index m, Index n, Index k) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(m, k);
|
||||
Mat B = Mat::Random(k, n);
|
||||
Mat C_init = Mat::Random(m, n);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);
|
||||
|
||||
d_C += d_A * d_B;
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = C_init + A * B;
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- GEMM accumulate into empty destination ---------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_accumulate_empty(Index m, Index n, Index k) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(m, k);
|
||||
Mat B = Mat::Random(k, n);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
|
||||
d_C += d_A * d_B;
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A * B;
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- GEMM subtract: C -= A * B (beta=1, alpha=-1) --------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_subtract(Index m, Index n, Index k) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(m, k);
|
||||
Mat B = Mat::Random(k, n);
|
||||
Mat C_init = Mat::Random(m, n);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);
|
||||
|
||||
GpuContext ctx;
|
||||
d_C.device(ctx) -= d_A * d_B;
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = C_init - A * B;
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- GEMM subtract from empty destination -----------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_subtract_empty(Index m, Index n, Index k) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(m, k);
|
||||
Mat B = Mat::Random(k, n);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
GpuContext ctx;
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
d_C.device(ctx) -= d_A * d_B;
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = -(A * B);
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- GEMM with scaled RHS: C = A * (alpha * B) -----------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_scaled_rhs(Index m, Index n, Index k) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(m, k);
|
||||
Mat B = Mat::Random(k, n);
|
||||
Scalar alpha = Scalar(3.0);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
d_C = d_A * (alpha * d_B);
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A * (alpha * B);
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- GEMM dimension mismatch must assert ------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_dimension_mismatch() {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
|
||||
Mat A = Mat::Random(8, 5);
|
||||
Mat B = Mat::Random(6, 7); // inner dimension mismatch
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
|
||||
VERIFY_RAISES_ASSERT(d_C = d_A * d_B);
|
||||
}
|
||||
|
||||
// ---- GEMM with explicit GpuContext ------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_explicit_context(Index m, Index n, Index k) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(m, k);
|
||||
Mat B = Mat::Random(k, n);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
GpuContext ctx;
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
d_C.device(ctx) = d_A * d_B;
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A * B;
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- GEMM cross-context reuse of the same destination -----------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_cross_context_reuse(Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(n, n);
|
||||
Mat B = Mat::Random(n, n);
|
||||
Mat D = Mat::Random(n, n);
|
||||
Mat E = Mat::Random(n, n);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
|
||||
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
|
||||
|
||||
GpuContext ctx1;
|
||||
GpuContext ctx2;
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
d_C.device(ctx1) = d_A * d_B;
|
||||
d_C.device(ctx2) += d_D * d_E;
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A * B + D * E;
|
||||
|
||||
RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- GEMM cross-context resize of the destination ---------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_cross_context_resize() {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(64, 64);
|
||||
Mat B = Mat::Random(64, 64);
|
||||
Mat D = Mat::Random(32, 16);
|
||||
Mat E = Mat::Random(16, 8);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
|
||||
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
|
||||
|
||||
GpuContext ctx1;
|
||||
GpuContext ctx2;
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
d_C.device(ctx1) = d_A * d_B;
|
||||
d_C.device(ctx2) = d_D * d_E;
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = D * E;
|
||||
|
||||
RealScalar tol = RealScalar(16) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- GEMM chaining: C = (A * B) then D = C * E -----------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_chain(Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(n, n);
|
||||
Mat B = Mat::Random(n, n);
|
||||
Mat E = Mat::Random(n, n);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
|
||||
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
d_C = d_A * d_B;
|
||||
DeviceMatrix<Scalar> d_D;
|
||||
d_D = d_C * d_E;
|
||||
|
||||
Mat D = d_D.toHost();
|
||||
Mat D_ref = (A * B) * E;
|
||||
|
||||
RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * D_ref.norm();
|
||||
VERIFY((D - D_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Square identity check: A * I = A ---------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_identity(Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
|
||||
Mat A = Mat::Random(n, n);
|
||||
Mat eye = Mat::Identity(n, n);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_I = DeviceMatrix<Scalar>::fromHost(eye);
|
||||
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
d_C = d_A * d_I;
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
VERIFY_IS_APPROX(C, A);
|
||||
}
|
||||
|
||||
// ---- LLT solve expression: d_X = d_A.llt().solve(d_B) ----------------------
|
||||
|
||||
template <typename MatrixType>
|
||||
MatrixType make_spd(Index n) {
|
||||
using Scalar = typename MatrixType::Scalar;
|
||||
MatrixType M = MatrixType::Random(n, n);
|
||||
return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
|
||||
}
|
||||
|
||||
template <typename Scalar>
|
||||
void test_llt_solve_expr(Index n, Index nrhs) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = make_spd<Mat>(n);
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
DeviceMatrix<Scalar> d_X;
|
||||
d_X = d_A.llt().solve(d_B);
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
}
|
||||
|
||||
// ---- LLT solve with explicit context ----------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_llt_solve_expr_context(Index n, Index nrhs) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = make_spd<Mat>(n);
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
GpuContext ctx;
|
||||
DeviceMatrix<Scalar> d_X;
|
||||
d_X.device(ctx) = d_A.llt().solve(d_B);
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
}
|
||||
|
||||
// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_lu_solve_expr(Index n, Index nrhs) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(n, n);
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
DeviceMatrix<Scalar> d_X;
|
||||
d_X = d_A.lu().solve(d_B);
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
}
|
||||
|
||||
// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_gemm_then_solve(Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(n, n);
|
||||
Mat D = Mat::Random(n, 1);
|
||||
|
||||
// Make SPD: C = A^H * A + n*I
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
d_C = d_A.adjoint() * d_A;
|
||||
|
||||
// Add n*I on host (no element-wise ops on DeviceMatrix yet).
|
||||
Mat C = d_C.toHost();
|
||||
C += Mat::Identity(n, n) * static_cast<Scalar>(n);
|
||||
d_C = DeviceMatrix<Scalar>::fromHost(C);
|
||||
|
||||
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
|
||||
|
||||
DeviceMatrix<Scalar> d_X;
|
||||
d_X = d_C.llt().solve(d_D);
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
RealScalar residual = (C * X - D).norm() / D.norm();
|
||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
}
|
||||
|
||||
// ---- LLT solve with Upper triangle -----------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_llt_solve_upper(Index n, Index nrhs) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = make_spd<Mat>(n);
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
DeviceMatrix<Scalar> d_X;
|
||||
d_X = d_A.template llt<Upper>().solve(d_B);
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
}
|
||||
|
||||
// ---- LU solve with explicit context -----------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_lu_solve_expr_context(Index n, Index nrhs) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(n, n);
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
GpuContext ctx;
|
||||
DeviceMatrix<Scalar> d_X;
|
||||
d_X.device(ctx) = d_A.lu().solve(d_B);
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
}
|
||||
|
||||
// ---- Zero-nrhs solver expressions ------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_llt_solve_zero_nrhs(Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
|
||||
Mat A = make_spd<Mat>(n);
|
||||
Mat B = Mat::Random(n, 0);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
DeviceMatrix<Scalar> d_X;
|
||||
d_X = d_A.llt().solve(d_B);
|
||||
|
||||
VERIFY_IS_EQUAL(d_X.rows(), n);
|
||||
VERIFY_IS_EQUAL(d_X.cols(), 0);
|
||||
}
|
||||
|
||||
template <typename Scalar>
|
||||
void test_lu_solve_zero_nrhs(Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
|
||||
Mat A = Mat::Random(n, n);
|
||||
Mat B = Mat::Random(n, 0);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
DeviceMatrix<Scalar> d_X;
|
||||
d_X = d_A.lu().solve(d_B);
|
||||
|
||||
VERIFY_IS_EQUAL(d_X.rows(), n);
|
||||
VERIFY_IS_EQUAL(d_X.cols(), 0);
|
||||
}
|
||||
|
||||
// ---- TRSM: triangularView<UpLo>().solve(B) ----------------------------------
|
||||
|
||||
template <typename Scalar, int UpLo>
|
||||
void test_trsm(Index n, Index nrhs) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
// Build a well-conditioned triangular matrix.
|
||||
Mat A = Mat::Random(n, n);
|
||||
A.diagonal().array() += static_cast<Scalar>(n); // ensure non-singular
|
||||
if (UpLo == Lower)
|
||||
A = A.template triangularView<Lower>();
|
||||
else
|
||||
A = A.template triangularView<Upper>();
|
||||
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
DeviceMatrix<Scalar> d_X;
|
||||
d_X = d_A.template triangularView<UpLo>().solve(d_B);
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
}
|
||||
|
||||
// ---- SYMM/HEMM: selfadjointView<UpLo>() * B --------------------------------
|
||||
|
||||
template <typename Scalar, int UpLo>
|
||||
void test_symm(Index n, Index nrhs) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = make_spd<Mat>(n); // SPD is also self-adjoint
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
d_C = d_A.template selfadjointView<UpLo>() * d_B;
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A * B; // A is symmetric, so full multiply == symm
|
||||
|
||||
RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- SYRK/HERK: rankUpdate(A) → C = A * A^H --------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_syrk(Index n, Index k) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(n, k);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
|
||||
DeviceMatrix<Scalar> d_C;
|
||||
d_C.template selfadjointView<Lower>().rankUpdate(d_A);
|
||||
|
||||
Mat C = d_C.toHost();
|
||||
// Only lower triangle is meaningful for SYRK. Compare lower triangle.
|
||||
Mat C_ref = A * A.adjoint();
|
||||
|
||||
// Extract lower triangle for comparison.
|
||||
Mat C_lower = C.template triangularView<Lower>();
|
||||
Mat C_ref_lower = C_ref.template triangularView<Lower>();
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
VERIFY((C_lower - C_ref_lower).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Per-scalar driver ------------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_scalar() {
|
||||
CALL_SUBTEST(test_gemm_basic<Scalar>(64, 64, 64));
|
||||
CALL_SUBTEST(test_gemm_basic<Scalar>(128, 64, 32));
|
||||
CALL_SUBTEST(test_gemm_basic<Scalar>(1, 1, 1));
|
||||
CALL_SUBTEST(test_gemm_basic<Scalar>(256, 256, 256));
|
||||
|
||||
CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(64, 64, 64));
|
||||
CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(128, 32, 64));
|
||||
|
||||
CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(64, 64, 64));
|
||||
CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(128, 32, 64));
|
||||
|
||||
CALL_SUBTEST(test_gemm_scaled<Scalar>(64, 64, 64));
|
||||
CALL_SUBTEST(test_gemm_scaled_rhs<Scalar>(64, 64, 64));
|
||||
CALL_SUBTEST(test_gemm_accumulate<Scalar>(64, 64, 64));
|
||||
CALL_SUBTEST(test_gemm_accumulate_empty<Scalar>(64, 64, 64));
|
||||
CALL_SUBTEST(test_gemm_subtract<Scalar>(64, 64, 64));
|
||||
CALL_SUBTEST(test_gemm_subtract_empty<Scalar>(64, 64, 64));
|
||||
CALL_SUBTEST(test_gemm_dimension_mismatch<Scalar>());
|
||||
CALL_SUBTEST(test_gemm_explicit_context<Scalar>(64, 64, 64));
|
||||
CALL_SUBTEST(test_gemm_cross_context_reuse<Scalar>(64));
|
||||
CALL_SUBTEST(test_gemm_cross_context_resize<Scalar>());
|
||||
CALL_SUBTEST(test_gemm_chain<Scalar>(64));
|
||||
CALL_SUBTEST(test_gemm_identity<Scalar>(64));
|
||||
|
||||
// Solver expressions — zero-size edge cases (use dedicated tests, not residual-based)
|
||||
|
||||
// Solver expressions
|
||||
CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 1));
|
||||
CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 4));
|
||||
CALL_SUBTEST(test_llt_solve_expr<Scalar>(256, 8));
|
||||
CALL_SUBTEST(test_llt_solve_expr_context<Scalar>(64, 4));
|
||||
CALL_SUBTEST(test_llt_solve_upper<Scalar>(64, 4));
|
||||
CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 1));
|
||||
CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 4));
|
||||
CALL_SUBTEST(test_lu_solve_expr<Scalar>(256, 8));
|
||||
CALL_SUBTEST(test_lu_solve_expr_context<Scalar>(64, 4));
|
||||
CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(64));
|
||||
CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(0));
|
||||
CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(64));
|
||||
CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(0));
|
||||
CALL_SUBTEST(test_gemm_then_solve<Scalar>(64));
|
||||
|
||||
// TRSM
|
||||
CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 1)));
|
||||
CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 4)));
|
||||
CALL_SUBTEST((test_trsm<Scalar, Upper>(64, 4)));
|
||||
CALL_SUBTEST((test_trsm<Scalar, Lower>(256, 8)));
|
||||
|
||||
// SYMM/HEMM
|
||||
CALL_SUBTEST((test_symm<Scalar, Lower>(64, 4)));
|
||||
CALL_SUBTEST((test_symm<Scalar, Upper>(64, 4)));
|
||||
CALL_SUBTEST((test_symm<Scalar, Lower>(128, 8)));
|
||||
|
||||
// SYRK/HERK
|
||||
CALL_SUBTEST(test_syrk<Scalar>(64, 64));
|
||||
CALL_SUBTEST(test_syrk<Scalar>(64, 32));
|
||||
CALL_SUBTEST(test_syrk<Scalar>(128, 64));
|
||||
}
|
||||
|
||||
// ---- Solver failure mode tests (not templated on Scalar) --------------------
|
||||
|
||||
void test_llt_not_spd() {
|
||||
// Negative definite matrix — LLT factorization must fail.
|
||||
MatrixXd A = -MatrixXd::Identity(8, 8);
|
||||
MatrixXd B = MatrixXd::Random(8, 1);
|
||||
auto d_A = DeviceMatrix<double>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<double>::fromHost(B);
|
||||
DeviceMatrix<double> d_X;
|
||||
VERIFY_RAISES_ASSERT(d_X = d_A.llt().solve(d_B));
|
||||
}
|
||||
|
||||
void test_lu_singular() {
|
||||
// Zero matrix — LU factorization must detect singularity.
|
||||
MatrixXd A = MatrixXd::Zero(8, 8);
|
||||
MatrixXd B = MatrixXd::Random(8, 1);
|
||||
auto d_A = DeviceMatrix<double>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<double>::fromHost(B);
|
||||
DeviceMatrix<double> d_X;
|
||||
VERIFY_RAISES_ASSERT(d_X = d_A.lu().solve(d_B));
|
||||
}
|
||||
|
||||
EIGEN_DECLARE_TEST(gpu_cublas) {
|
||||
CALL_SUBTEST(test_scalar<float>());
|
||||
CALL_SUBTEST(test_scalar<double>());
|
||||
CALL_SUBTEST(test_scalar<std::complex<float>>());
|
||||
CALL_SUBTEST(test_scalar<std::complex<double>>());
|
||||
CALL_SUBTEST(test_llt_not_spd());
|
||||
CALL_SUBTEST(test_lu_singular());
|
||||
}
|
||||
210
test/gpu_cusolver_llt.cpp
Normal file
210
test/gpu_cusolver_llt.cpp
Normal file
@@ -0,0 +1,210 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Eigen Authors
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Tests for GpuLLT: GPU Cholesky (LL^T) using cuSOLVER.
|
||||
// Covers cusolverDnXpotrf (factorization) and cusolverDnXpotrs (solve)
|
||||
// for float, double, complex<float>, complex<double>, Lower and Upper.
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
#include "main.h"
|
||||
#include <Eigen/Cholesky>
|
||||
#include <Eigen/GPU>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
// Build a random symmetric positive-definite matrix: A = M^H*M + n*I.
|
||||
template <typename MatrixType>
|
||||
MatrixType make_spd(Index n) {
|
||||
using Scalar = typename MatrixType::Scalar;
|
||||
MatrixType M = MatrixType::Random(n, n);
|
||||
return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
|
||||
}
|
||||
|
||||
// Test factorization: L*L^H must reconstruct A to within floating-point tolerance.
|
||||
template <typename Scalar, int UpLo>
|
||||
void test_potrf(Index n) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
MatrixType A = make_spd<MatrixType>(n);
|
||||
|
||||
GpuLLT<Scalar, UpLo> llt(A);
|
||||
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||
|
||||
// Reconstruct L*L^H and compare to original A.
|
||||
// GpuLLT stores the factor on device; use CPU LLT to get the triangular factor
|
||||
// for reconstruction since GpuLLT does not expose the device-resident factor directly.
|
||||
LLT<MatrixType, UpLo> ref(A);
|
||||
VERIFY_IS_EQUAL(ref.info(), Success);
|
||||
MatrixType A_reconstructed = ref.reconstructedMatrix();
|
||||
|
||||
// Both should equal A to within n*eps*||A||.
|
||||
RealScalar tol = RealScalar(4) * RealScalar(n) * NumTraits<Scalar>::epsilon() * A.norm();
|
||||
VERIFY((A_reconstructed - A).norm() < tol);
|
||||
|
||||
// Smoke-test: llt.solve(b) should return the same result as ref.solve(b).
|
||||
MatrixType b = MatrixType::Random(n, 1);
|
||||
MatrixType x_gpu = llt.solve(b);
|
||||
MatrixType x_cpu = ref.solve(b);
|
||||
VERIFY((x_gpu - x_cpu).norm() < tol);
|
||||
}
|
||||
|
||||
// Test solve: residual ||A*X - B|| / ||B|| must be small.
|
||||
template <typename Scalar, int UpLo>
|
||||
void test_potrs(Index n, Index nrhs) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
MatrixType A = make_spd<MatrixType>(n);
|
||||
MatrixType B = MatrixType::Random(n, nrhs);
|
||||
|
||||
GpuLLT<Scalar, UpLo> llt(A);
|
||||
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||
|
||||
MatrixType X = llt.solve(B);
|
||||
|
||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||
RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||
VERIFY(residual < tol);
|
||||
}
|
||||
|
||||
// Test that multiple solves against the same factor all produce correct results.
|
||||
// This exercises the key design property: L stays on device across calls.
|
||||
template <typename Scalar>
|
||||
void test_multiple_solves(Index n) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
MatrixType A = make_spd<MatrixType>(n);
|
||||
GpuLLT<Scalar, Lower> llt(A);
|
||||
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||
|
||||
RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||
for (int k = 0; k < 5; ++k) {
|
||||
MatrixType B = MatrixType::Random(n, 3);
|
||||
MatrixType X = llt.solve(B);
|
||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||
VERIFY(residual < tol);
|
||||
}
|
||||
}
|
||||
|
||||
// Test that GpuLLT correctly detects a non-SPD matrix.
|
||||
void test_not_spd() {
|
||||
MatrixXd A = -MatrixXd::Identity(8, 8); // negative definite
|
||||
GpuLLT<double> llt(A);
|
||||
VERIFY_IS_EQUAL(llt.info(), NumericalIssue);
|
||||
}
|
||||
|
||||
// ---- DeviceMatrix integration tests -----------------------------------------
|
||||
|
||||
// compute(DeviceMatrix) + solve(DeviceMatrix) → toHost
|
||||
template <typename Scalar, int UpLo>
|
||||
void test_device_matrix_solve(Index n, Index nrhs) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
MatrixType A = make_spd<MatrixType>(n);
|
||||
MatrixType B = MatrixType::Random(n, nrhs);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
GpuLLT<Scalar, UpLo> llt;
|
||||
llt.compute(d_A);
|
||||
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||
|
||||
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
|
||||
MatrixType X = d_X.toHost();
|
||||
|
||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
}
|
||||
|
||||
// compute(DeviceMatrix&&) — move path
|
||||
template <typename Scalar>
|
||||
void test_device_matrix_move_compute(Index n) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
MatrixType A = make_spd<MatrixType>(n);
|
||||
MatrixType B = MatrixType::Random(n, 1);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
GpuLLT<Scalar, Lower> llt;
|
||||
llt.compute(std::move(d_A));
|
||||
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||
|
||||
// d_A should be empty after move.
|
||||
VERIFY(d_A.empty());
|
||||
|
||||
MatrixType X = llt.solve(B);
|
||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
}
|
||||
|
||||
// Full async chain: compute → solve → solve again with result as RHS → toHost
|
||||
template <typename Scalar>
|
||||
void test_chaining(Index n) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
MatrixType A = make_spd<MatrixType>(n);
|
||||
MatrixType B = MatrixType::Random(n, 3);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
GpuLLT<Scalar, Lower> llt;
|
||||
llt.compute(d_A);
|
||||
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||
|
||||
// Chain: solve → use result as RHS for another solve
|
||||
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
|
||||
DeviceMatrix<Scalar> d_Y = llt.solve(d_X);
|
||||
|
||||
// Only sync at the very end.
|
||||
MatrixType Y = d_Y.toHost();
|
||||
|
||||
// Verify: Y = A^{-2} * B
|
||||
MatrixType X_ref = LLT<MatrixType, Lower>(A).solve(B);
|
||||
MatrixType Y_ref = LLT<MatrixType, Lower>(A).solve(X_ref);
|
||||
|
||||
RealScalar tol = RealScalar(4) * RealScalar(n) * NumTraits<Scalar>::epsilon() * Y_ref.norm();
|
||||
VERIFY((Y - Y_ref).norm() < tol);
|
||||
}
|
||||
|
||||
template <typename Scalar>
|
||||
void test_scalar() {
|
||||
CALL_SUBTEST((test_potrf<Scalar, Lower>(1)));
|
||||
CALL_SUBTEST((test_potrf<Scalar, Lower>(64)));
|
||||
CALL_SUBTEST((test_potrf<Scalar, Lower>(256)));
|
||||
CALL_SUBTEST((test_potrf<Scalar, Upper>(64)));
|
||||
CALL_SUBTEST((test_potrf<Scalar, Upper>(256)));
|
||||
|
||||
CALL_SUBTEST((test_potrs<Scalar, Lower>(64, 1)));
|
||||
CALL_SUBTEST((test_potrs<Scalar, Lower>(64, 4)));
|
||||
CALL_SUBTEST((test_potrs<Scalar, Lower>(256, 8)));
|
||||
CALL_SUBTEST((test_potrs<Scalar, Upper>(64, 1)));
|
||||
CALL_SUBTEST((test_potrs<Scalar, Upper>(256, 4)));
|
||||
|
||||
CALL_SUBTEST(test_multiple_solves<Scalar>(128));
|
||||
|
||||
CALL_SUBTEST((test_device_matrix_solve<Scalar, Lower>(64, 4)));
|
||||
CALL_SUBTEST((test_device_matrix_solve<Scalar, Upper>(128, 1)));
|
||||
CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));
|
||||
CALL_SUBTEST(test_chaining<Scalar>(64));
|
||||
}
|
||||
|
||||
EIGEN_DECLARE_TEST(gpu_cusolver_llt) {
|
||||
CALL_SUBTEST(test_scalar<float>());
|
||||
CALL_SUBTEST(test_scalar<double>());
|
||||
CALL_SUBTEST(test_scalar<std::complex<float>>());
|
||||
CALL_SUBTEST(test_scalar<std::complex<double>>());
|
||||
CALL_SUBTEST(test_not_spd());
|
||||
}
|
||||
206
test/gpu_cusolver_lu.cpp
Normal file
206
test/gpu_cusolver_lu.cpp
Normal file
@@ -0,0 +1,206 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Eigen Authors
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Tests for GpuLU: GPU partial-pivoting LU decomposition via cuSOLVER.
|
||||
// Covers cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve)
|
||||
// for float, double, complex<float>, complex<double>.
|
||||
//
|
||||
#define EIGEN_USE_GPU
|
||||
#include "main.h"
|
||||
#include <Eigen/LU>
|
||||
#include <Eigen/GPU>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
// ---- Test factorization + NoTrans solve: residual ||A*X - B|| / ||B|| -------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_getrf(Index n) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
MatrixType A = MatrixType::Random(n, n);
|
||||
MatrixType B = MatrixType::Random(n, 4);
|
||||
|
||||
GpuLU<Scalar> lu(A);
|
||||
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||
|
||||
MatrixType X = lu.solve(B);
|
||||
// Backward error bound for LU: ||A*X - B|| <= O(n*u) * ||A|| * ||X||.
|
||||
// Normalize by ||A||*||X|| rather than ||B|| to be condition-number agnostic.
|
||||
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
}
|
||||
|
||||
// ---- Test solve: A^T*X = B and A^H*X = B ------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_getrs_trans(Index n) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
MatrixType A = MatrixType::Random(n, n);
|
||||
MatrixType B = MatrixType::Random(n, 3);
|
||||
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||
|
||||
GpuLU<Scalar> lu(A);
|
||||
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||
|
||||
MatrixType Xt = lu.solve(B, GpuLU<Scalar>::Transpose);
|
||||
VERIFY((A.transpose() * Xt - B).norm() / (A.norm() * Xt.norm()) < tol);
|
||||
|
||||
MatrixType Xc = lu.solve(B, GpuLU<Scalar>::ConjugateTranspose);
|
||||
VERIFY((A.adjoint() * Xc - B).norm() / (A.norm() * Xc.norm()) < tol);
|
||||
}
|
||||
|
||||
// ---- Test multiple solves reuse the device-resident LU ----------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_multiple_solves(Index n) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
MatrixType A = MatrixType::Random(n, n);
|
||||
GpuLU<Scalar> lu(A);
|
||||
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||
|
||||
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||
for (int k = 0; k < 5; ++k) {
|
||||
MatrixType B = MatrixType::Random(n, 3);
|
||||
MatrixType X = lu.solve(B);
|
||||
VERIFY((A * X - B).norm() / (A.norm() * X.norm()) < tol);
|
||||
}
|
||||
}
|
||||
|
||||
// ---- Agreement with CPU PartialPivLU ----------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_vs_cpu(Index n) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
MatrixType A = MatrixType::Random(n, n);
|
||||
MatrixType B = MatrixType::Random(n, 5);
|
||||
|
||||
GpuLU<Scalar> gpu_lu(A);
|
||||
VERIFY_IS_EQUAL(gpu_lu.info(), Success);
|
||||
|
||||
MatrixType X_gpu = gpu_lu.solve(B);
|
||||
MatrixType X_cpu = PartialPivLU<MatrixType>(A).solve(B);
|
||||
|
||||
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||
VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Singular matrix detection ----------------------------------------------
|
||||
|
||||
void test_singular() {
|
||||
MatrixXd A = MatrixXd::Zero(8, 8);
|
||||
GpuLU<double> lu(A);
|
||||
VERIFY_IS_EQUAL(lu.info(), NumericalIssue);
|
||||
}
|
||||
|
||||
// ---- DeviceMatrix integration tests -----------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_device_matrix_solve(Index n) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
MatrixType A = MatrixType::Random(n, n);
|
||||
MatrixType B = MatrixType::Random(n, 4);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
GpuLU<Scalar> lu;
|
||||
lu.compute(d_A);
|
||||
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||
|
||||
DeviceMatrix<Scalar> d_X = lu.solve(d_B);
|
||||
MatrixType X = d_X.toHost();
|
||||
|
||||
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
}
|
||||
|
||||
template <typename Scalar>
|
||||
void test_device_matrix_move_compute(Index n) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
MatrixType A = MatrixType::Random(n, n);
|
||||
MatrixType B = MatrixType::Random(n, 1);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
GpuLU<Scalar> lu;
|
||||
lu.compute(std::move(d_A));
|
||||
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||
VERIFY(d_A.empty());
|
||||
|
||||
MatrixType X = lu.solve(B);
|
||||
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
}
|
||||
|
||||
template <typename Scalar>
|
||||
void test_chaining(Index n) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
MatrixType A = MatrixType::Random(n, n);
|
||||
MatrixType B = MatrixType::Random(n, 3);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
GpuLU<Scalar> lu;
|
||||
lu.compute(d_A);
|
||||
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||
|
||||
// Chain: solve → use result as RHS
|
||||
DeviceMatrix<Scalar> d_X = lu.solve(d_B);
|
||||
DeviceMatrix<Scalar> d_Y = lu.solve(d_X);
|
||||
MatrixType Y = d_Y.toHost();
|
||||
|
||||
MatrixType X_ref = PartialPivLU<MatrixType>(A).solve(B);
|
||||
MatrixType Y_ref = PartialPivLU<MatrixType>(A).solve(X_ref);
|
||||
|
||||
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon() * Y_ref.norm();
|
||||
VERIFY((Y - Y_ref).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Per-scalar driver -------------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_scalar() {
|
||||
CALL_SUBTEST(test_getrf<Scalar>(1));
|
||||
CALL_SUBTEST(test_getrf<Scalar>(64));
|
||||
CALL_SUBTEST(test_getrf<Scalar>(256));
|
||||
|
||||
CALL_SUBTEST(test_getrs_trans<Scalar>(64));
|
||||
CALL_SUBTEST(test_getrs_trans<Scalar>(128));
|
||||
|
||||
CALL_SUBTEST(test_multiple_solves<Scalar>(128));
|
||||
|
||||
CALL_SUBTEST(test_vs_cpu<Scalar>(64));
|
||||
CALL_SUBTEST(test_vs_cpu<Scalar>(256));
|
||||
|
||||
CALL_SUBTEST(test_device_matrix_solve<Scalar>(64));
|
||||
CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));
|
||||
CALL_SUBTEST(test_chaining<Scalar>(64));
|
||||
}
|
||||
|
||||
EIGEN_DECLARE_TEST(gpu_cusolver_lu) {
|
||||
CALL_SUBTEST(test_scalar<float>());
|
||||
CALL_SUBTEST(test_scalar<double>());
|
||||
CALL_SUBTEST(test_scalar<std::complex<float>>());
|
||||
CALL_SUBTEST(test_scalar<std::complex<double>>());
|
||||
CALL_SUBTEST(test_singular());
|
||||
}
|
||||
247
test/gpu_device_matrix.cpp
Normal file
247
test/gpu_device_matrix.cpp
Normal file
@@ -0,0 +1,247 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Tests for DeviceMatrix and HostTransfer: typed RAII GPU memory wrapper.
|
||||
// No cuSOLVER dependency — only CUDA runtime.
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
#include "main.h"
|
||||
#include <Eigen/GPU>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
// ---- Default construction ---------------------------------------------------
|
||||
|
||||
void test_default_construct() {
|
||||
DeviceMatrix<double> dm;
|
||||
VERIFY(dm.empty());
|
||||
VERIFY_IS_EQUAL(dm.rows(), 0);
|
||||
VERIFY_IS_EQUAL(dm.cols(), 0);
|
||||
VERIFY(dm.data() == nullptr);
|
||||
VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(0));
|
||||
}
|
||||
|
||||
// ---- Allocate uninitialized -------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_allocate(Index rows, Index cols) {
|
||||
DeviceMatrix<Scalar> dm(rows, cols);
|
||||
VERIFY(!dm.empty());
|
||||
VERIFY_IS_EQUAL(dm.rows(), rows);
|
||||
VERIFY_IS_EQUAL(dm.cols(), cols);
|
||||
VERIFY_IS_EQUAL(dm.outerStride(), rows);
|
||||
VERIFY(dm.data() != nullptr);
|
||||
VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar));
|
||||
}
|
||||
|
||||
// ---- fromHost / toHost roundtrip (synchronous) ------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_roundtrip(Index rows, Index cols) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
MatrixType host = MatrixType::Random(rows, cols);
|
||||
|
||||
auto dm = DeviceMatrix<Scalar>::fromHost(host);
|
||||
VERIFY_IS_EQUAL(dm.rows(), rows);
|
||||
VERIFY_IS_EQUAL(dm.cols(), cols);
|
||||
VERIFY(!dm.empty());
|
||||
|
||||
MatrixType result = dm.toHost();
|
||||
VERIFY_IS_EQUAL(result.rows(), rows);
|
||||
VERIFY_IS_EQUAL(result.cols(), cols);
|
||||
VERIFY_IS_APPROX(result, host);
|
||||
}
|
||||
|
||||
// ---- fromHostAsync / toHostAsync roundtrip -----------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_roundtrip_async(Index rows, Index cols) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
MatrixType host = MatrixType::Random(rows, cols);
|
||||
|
||||
cudaStream_t stream;
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));
|
||||
|
||||
// Async upload from raw pointer.
|
||||
auto dm = DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, rows, stream);
|
||||
VERIFY_IS_EQUAL(dm.rows(), rows);
|
||||
VERIFY_IS_EQUAL(dm.cols(), cols);
|
||||
|
||||
// Async download via HostTransfer future.
|
||||
auto transfer = dm.toHostAsync(stream);
|
||||
|
||||
// get() blocks and returns the matrix.
|
||||
MatrixType result = transfer.get();
|
||||
VERIFY_IS_APPROX(result, host);
|
||||
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(stream));
|
||||
}
|
||||
|
||||
// ---- HostTransfer::ready() and idempotent get() -----------------------------
|
||||
|
||||
void test_host_transfer_ready() {
|
||||
using MatrixType = Matrix<double, Dynamic, Dynamic>;
|
||||
MatrixType host = MatrixType::Random(100, 100);
|
||||
|
||||
auto dm = DeviceMatrix<double>::fromHost(host);
|
||||
auto transfer = dm.toHostAsync();
|
||||
|
||||
// After get(), ready() must return true.
|
||||
MatrixType result = transfer.get();
|
||||
VERIFY(transfer.ready());
|
||||
VERIFY_IS_APPROX(result, host);
|
||||
|
||||
// get() is idempotent.
|
||||
MatrixType& result2 = transfer.get();
|
||||
VERIFY_IS_APPROX(result2, host);
|
||||
}
|
||||
|
||||
// ---- HostTransfer move ------------------------------------------------------
|
||||
|
||||
void test_host_transfer_move() {
|
||||
using MatrixType = Matrix<double, Dynamic, Dynamic>;
|
||||
MatrixType host = MatrixType::Random(50, 50);
|
||||
|
||||
auto dm = DeviceMatrix<double>::fromHost(host);
|
||||
auto transfer = dm.toHostAsync();
|
||||
|
||||
HostTransfer<double> moved(std::move(transfer));
|
||||
MatrixType result = moved.get();
|
||||
VERIFY_IS_APPROX(result, host);
|
||||
}
|
||||
|
||||
// ---- clone() produces independent copy --------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_clone(Index rows, Index cols) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
MatrixType host = MatrixType::Random(rows, cols);
|
||||
|
||||
auto dm = DeviceMatrix<Scalar>::fromHost(host);
|
||||
auto cloned = dm.clone();
|
||||
|
||||
// Overwrite original with different data.
|
||||
MatrixType other = MatrixType::Random(rows, cols);
|
||||
dm = DeviceMatrix<Scalar>::fromHost(other);
|
||||
|
||||
// Clone still holds the original data.
|
||||
MatrixType clone_result = cloned.toHost();
|
||||
VERIFY_IS_APPROX(clone_result, host);
|
||||
|
||||
// Original holds the new data.
|
||||
MatrixType dm_result = dm.toHost();
|
||||
VERIFY_IS_APPROX(dm_result, other);
|
||||
}
|
||||
|
||||
// ---- Move construct ---------------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_move_construct(Index rows, Index cols) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
MatrixType host = MatrixType::Random(rows, cols);
|
||||
|
||||
auto dm = DeviceMatrix<Scalar>::fromHost(host);
|
||||
DeviceMatrix<Scalar> moved(std::move(dm));
|
||||
|
||||
VERIFY(dm.empty());
|
||||
VERIFY(dm.data() == nullptr);
|
||||
|
||||
VERIFY_IS_EQUAL(moved.rows(), rows);
|
||||
VERIFY_IS_EQUAL(moved.cols(), cols);
|
||||
MatrixType result = moved.toHost();
|
||||
VERIFY_IS_APPROX(result, host);
|
||||
}
|
||||
|
||||
// ---- Move assign ------------------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_move_assign(Index rows, Index cols) {
|
||||
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
MatrixType host = MatrixType::Random(rows, cols);
|
||||
|
||||
auto dm = DeviceMatrix<Scalar>::fromHost(host);
|
||||
DeviceMatrix<Scalar> dest;
|
||||
dest = std::move(dm);
|
||||
|
||||
VERIFY(dm.empty());
|
||||
VERIFY_IS_EQUAL(dest.rows(), rows);
|
||||
MatrixType result = dest.toHost();
|
||||
VERIFY_IS_APPROX(result, host);
|
||||
}
|
||||
|
||||
// ---- resize() ---------------------------------------------------------------
|
||||
|
||||
void test_resize() {
|
||||
DeviceMatrix<double> dm(10, 20);
|
||||
VERIFY_IS_EQUAL(dm.rows(), 10);
|
||||
VERIFY_IS_EQUAL(dm.cols(), 20);
|
||||
|
||||
dm.resize(50, 30);
|
||||
VERIFY_IS_EQUAL(dm.rows(), 50);
|
||||
VERIFY_IS_EQUAL(dm.cols(), 30);
|
||||
VERIFY_IS_EQUAL(dm.outerStride(), 50);
|
||||
VERIFY(dm.data() != nullptr);
|
||||
|
||||
// Resize to same dimensions is a no-op.
|
||||
double* ptr_before = dm.data();
|
||||
dm.resize(50, 30);
|
||||
VERIFY(dm.data() == ptr_before);
|
||||
}
|
||||
|
||||
// ---- Empty / 0x0 matrix -----------------------------------------------------
|
||||
|
||||
void test_empty() {
|
||||
using MatrixType = Matrix<double, Dynamic, Dynamic>;
|
||||
MatrixType empty_mat(0, 0);
|
||||
|
||||
auto dm = DeviceMatrix<double>::fromHost(empty_mat);
|
||||
VERIFY(dm.empty());
|
||||
VERIFY_IS_EQUAL(dm.rows(), 0);
|
||||
VERIFY_IS_EQUAL(dm.cols(), 0);
|
||||
|
||||
MatrixType result = dm.toHost();
|
||||
VERIFY_IS_EQUAL(result.rows(), 0);
|
||||
VERIFY_IS_EQUAL(result.cols(), 0);
|
||||
}
|
||||
|
||||
// ---- Per-scalar driver ------------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_scalar() {
|
||||
// Square.
|
||||
CALL_SUBTEST(test_roundtrip<Scalar>(1, 1));
|
||||
CALL_SUBTEST(test_roundtrip<Scalar>(64, 64));
|
||||
CALL_SUBTEST(test_roundtrip<Scalar>(256, 256));
|
||||
|
||||
// Rectangular.
|
||||
CALL_SUBTEST(test_roundtrip<Scalar>(100, 7));
|
||||
CALL_SUBTEST(test_roundtrip<Scalar>(7, 100));
|
||||
|
||||
// Async roundtrip.
|
||||
CALL_SUBTEST(test_roundtrip_async<Scalar>(64, 64));
|
||||
CALL_SUBTEST(test_roundtrip_async<Scalar>(100, 7));
|
||||
|
||||
CALL_SUBTEST(test_clone<Scalar>(64, 64));
|
||||
CALL_SUBTEST(test_move_construct<Scalar>(64, 64));
|
||||
CALL_SUBTEST(test_move_assign<Scalar>(64, 64));
|
||||
}
|
||||
|
||||
EIGEN_DECLARE_TEST(gpu_device_matrix) {
|
||||
CALL_SUBTEST(test_default_construct());
|
||||
CALL_SUBTEST(test_empty());
|
||||
CALL_SUBTEST(test_resize());
|
||||
CALL_SUBTEST(test_host_transfer_ready());
|
||||
CALL_SUBTEST(test_host_transfer_move());
|
||||
CALL_SUBTEST((test_allocate<float>(100, 50)));
|
||||
CALL_SUBTEST((test_allocate<double>(100, 50)));
|
||||
CALL_SUBTEST(test_scalar<float>());
|
||||
CALL_SUBTEST(test_scalar<double>());
|
||||
CALL_SUBTEST(test_scalar<std::complex<float>>());
|
||||
CALL_SUBTEST(test_scalar<std::complex<double>>());
|
||||
}
|
||||
110
test/gpu_library_example.cu
Normal file
110
test/gpu_library_example.cu
Normal file
@@ -0,0 +1,110 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Smoke test for GPU library test infrastructure.
|
||||
// Verifies GpuContext, GpuBuffer, and host<->device matrix transfers
|
||||
// without requiring any NVIDIA library (cuBLAS, cuSOLVER, etc.).
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
#include "main.h"
|
||||
#include "gpu_context.h"
|
||||
#include "gpu_library_test_helper.h"
|
||||
|
||||
using namespace Eigen;
|
||||
using namespace Eigen::test;
|
||||
|
||||
// Test that GpuContext initializes, reports valid device info, and owns a cuSOLVER handle.
|
||||
void test_gpu_context() {
|
||||
GpuContext ctx;
|
||||
VERIFY(ctx.device() >= 0);
|
||||
VERIFY(ctx.deviceProperties().major >= 7); // sm_70 minimum
|
||||
VERIFY(ctx.stream != nullptr);
|
||||
VERIFY(ctx.cusolver != nullptr);
|
||||
std::cout << " GPU: " << ctx.deviceProperties().name << " (sm_" << ctx.deviceProperties().major
|
||||
<< ctx.deviceProperties().minor << ")\n";
|
||||
}
|
||||
|
||||
// Test dense matrix roundtrip: host -> device -> host.
|
||||
template <typename MatrixType>
|
||||
void test_dense_roundtrip() {
|
||||
GpuContext ctx;
|
||||
const Index rows = 64;
|
||||
const Index cols = 32;
|
||||
|
||||
MatrixType A = MatrixType::Random(rows, cols);
|
||||
auto buf = gpu_copy_to_device(ctx.stream, A);
|
||||
VERIFY(buf.data != nullptr);
|
||||
VERIFY(buf.size == rows * cols);
|
||||
|
||||
MatrixType B(rows, cols);
|
||||
B.setZero();
|
||||
gpu_copy_to_host(ctx.stream, buf, B);
|
||||
ctx.synchronize();
|
||||
|
||||
VERIFY_IS_EQUAL(A, B);
|
||||
}
|
||||
|
||||
// Test GpuBuffer RAII: move semantics, async zero-init.
|
||||
void test_gpu_buffer() {
|
||||
GpuContext ctx;
|
||||
|
||||
GpuBuffer<float> a(128);
|
||||
VERIFY(a.data != nullptr);
|
||||
VERIFY(a.size == 128);
|
||||
|
||||
// Move construction.
|
||||
GpuBuffer<float> b(std::move(a));
|
||||
VERIFY(a.data == nullptr);
|
||||
VERIFY(b.data != nullptr);
|
||||
VERIFY(b.size == 128);
|
||||
|
||||
// Move assignment.
|
||||
GpuBuffer<float> c;
|
||||
c = std::move(b);
|
||||
VERIFY(b.data == nullptr);
|
||||
VERIFY(c.data != nullptr);
|
||||
|
||||
// setZeroAsync.
|
||||
c.setZeroAsync(ctx.stream);
|
||||
ctx.synchronize();
|
||||
|
||||
std::vector<float> host(128);
|
||||
GPU_CHECK(cudaMemcpy(host.data(), c.data, 128 * sizeof(float), cudaMemcpyDeviceToHost));
|
||||
for (int i = 0; i < 128; ++i) {
|
||||
VERIFY_IS_EQUAL(host[i], 0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
// Test with vectors (1D).
|
||||
template <typename Scalar>
|
||||
void test_vector_roundtrip() {
|
||||
GpuContext ctx;
|
||||
const Index n = 256;
|
||||
Matrix<Scalar, Dynamic, 1> v = Matrix<Scalar, Dynamic, 1>::Random(n);
|
||||
auto buf = gpu_copy_to_device(ctx.stream, v);
|
||||
|
||||
Matrix<Scalar, Dynamic, 1> w(n);
|
||||
w.setZero();
|
||||
gpu_copy_to_host(ctx.stream, buf, w);
|
||||
ctx.synchronize();
|
||||
|
||||
VERIFY_IS_EQUAL(v, w);
|
||||
}
|
||||
|
||||
EIGEN_DECLARE_TEST(gpu_library_example) {
|
||||
CALL_SUBTEST(test_gpu_context());
|
||||
CALL_SUBTEST(test_gpu_buffer());
|
||||
CALL_SUBTEST(test_dense_roundtrip<MatrixXf>());
|
||||
CALL_SUBTEST(test_dense_roundtrip<MatrixXd>());
|
||||
CALL_SUBTEST((test_dense_roundtrip<Matrix<float, Dynamic, Dynamic, RowMajor>>()));
|
||||
CALL_SUBTEST((test_dense_roundtrip<Matrix<double, Dynamic, Dynamic, RowMajor>>()));
|
||||
CALL_SUBTEST(test_vector_roundtrip<float>());
|
||||
CALL_SUBTEST(test_vector_roundtrip<double>());
|
||||
CALL_SUBTEST(test_vector_roundtrip<std::complex<float>>());
|
||||
}
|
||||
90
test/gpu_library_test_helper.h
Normal file
90
test/gpu_library_test_helper.h
Normal file
@@ -0,0 +1,90 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H
|
||||
#define EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H
|
||||
|
||||
// Helpers for GPU tests that call NVIDIA library APIs (cuBLAS, cuSOLVER, etc.)
|
||||
// from the host side. Provides RAII GPU memory management and async matrix transfer.
|
||||
//
|
||||
// This is separate from gpu_common.h (element-parallel device kernels) and
|
||||
// gpu_test_helper.h (serialization-based device kernels). Those patterns run
|
||||
// user functors inside GPU kernels. This helper is for host-orchestrated tests
|
||||
// that call library APIs which launch their own kernels internally.
|
||||
//
|
||||
// All transfers use an explicit stream and cudaMemcpyAsync. Callers must
|
||||
// synchronize (ctx.synchronize() or cudaStreamSynchronize) before reading
|
||||
// results back on the host.
|
||||
|
||||
#include "gpu_test_helper.h"
|
||||
|
||||
namespace Eigen {
|
||||
namespace test {
|
||||
|
||||
// RAII wrapper for GPU device memory. Prevents leaks when VERIFY macros abort.
|
||||
template <typename Scalar>
|
||||
struct GpuBuffer {
|
||||
Scalar* data = nullptr;
|
||||
Index size = 0;
|
||||
|
||||
GpuBuffer() = default;
|
||||
|
||||
explicit GpuBuffer(Index n) : size(n) { GPU_CHECK(gpuMalloc(reinterpret_cast<void**>(&data), n * sizeof(Scalar))); }
|
||||
|
||||
~GpuBuffer() {
|
||||
if (data) GPU_CHECK(gpuFree(data));
|
||||
}
|
||||
|
||||
// Move-only.
|
||||
GpuBuffer(GpuBuffer&& other) noexcept : data(other.data), size(other.size) {
|
||||
other.data = nullptr;
|
||||
other.size = 0;
|
||||
}
|
||||
GpuBuffer& operator=(GpuBuffer&& other) noexcept {
|
||||
if (this != &other) {
|
||||
if (data) GPU_CHECK(gpuFree(data));
|
||||
data = other.data;
|
||||
size = other.size;
|
||||
other.data = nullptr;
|
||||
other.size = 0;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
GpuBuffer(const GpuBuffer&) = delete;
|
||||
GpuBuffer& operator=(const GpuBuffer&) = delete;
|
||||
|
||||
// Async zero the buffer on the given stream.
|
||||
void setZeroAsync(cudaStream_t stream) { GPU_CHECK(cudaMemsetAsync(data, 0, size * sizeof(Scalar), stream)); }
|
||||
};
|
||||
|
||||
// Copy a dense Eigen matrix to a new GPU buffer, async on the given stream.
|
||||
// Caller must synchronize before the host matrix is freed or modified.
|
||||
template <typename Derived>
|
||||
GpuBuffer<typename Derived::Scalar> gpu_copy_to_device(cudaStream_t stream, const MatrixBase<Derived>& host_mat) {
|
||||
using Scalar = typename Derived::Scalar;
|
||||
const auto& mat = host_mat.derived();
|
||||
GpuBuffer<Scalar> buf(mat.size());
|
||||
GPU_CHECK(cudaMemcpyAsync(buf.data, mat.data(), mat.size() * sizeof(Scalar), cudaMemcpyHostToDevice, stream));
|
||||
return buf;
|
||||
}
|
||||
|
||||
// Copy GPU buffer contents back to a dense Eigen matrix, async on the given stream.
|
||||
// Caller must synchronize before reading from host_mat.
|
||||
template <typename Scalar, typename Derived>
|
||||
void gpu_copy_to_host(cudaStream_t stream, const GpuBuffer<Scalar>& buf, MatrixBase<Derived>& host_mat) {
|
||||
auto& mat = host_mat.derived();
|
||||
eigen_assert(buf.size == mat.size());
|
||||
GPU_CHECK(cudaMemcpyAsync(mat.data(), buf.data, mat.size() * sizeof(Scalar), cudaMemcpyDeviceToHost, stream));
|
||||
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H
|
||||
Reference in New Issue
Block a user