mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Compare commits
1 Commits
gpu-librar
...
gpu-dense-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8593c7f5a1 |
@@ -47,6 +47,9 @@
|
||||
#include "src/GPU/DeviceDispatch.h"
|
||||
#include "src/GPU/GpuLLT.h"
|
||||
#include "src/GPU/GpuLU.h"
|
||||
#include "src/GPU/GpuQR.h"
|
||||
#include "src/GPU/GpuSVD.h"
|
||||
#include "src/GPU/GpuEigenSolver.h"
|
||||
// IWYU pragma: end_exports
|
||||
#endif
|
||||
|
||||
|
||||
@@ -51,25 +51,69 @@ constexpr cublasOperation_t to_cublas_op(GpuOp op) {
|
||||
|
||||
// ---- Scalar → cublasComputeType_t -------------------------------------------
|
||||
// cublasGemmEx requires a compute type (separate from the data type).
|
||||
//
|
||||
// Precision policy:
|
||||
// - Default: tensor core algorithms enabled (CUBLAS_GEMM_DEFAULT_TENSOR_OP).
|
||||
// For double, cuBLAS may use Ozaki emulation on sm_80+ tensor cores.
|
||||
// - EIGEN_CUDA_TF32: opt-in to TF32 for float (~2x faster, 10-bit mantissa).
|
||||
// - EIGEN_NO_CUDA_TENSOR_OPS: disables all tensor core usage. Uses pedantic
|
||||
// compute types and CUBLAS_GEMM_DEFAULT algorithm. For bit-exact reproducibility.
|
||||
|
||||
template <typename Scalar>
|
||||
struct cuda_compute_type;
|
||||
|
||||
template <>
|
||||
struct cuda_compute_type<float> {
|
||||
#if defined(EIGEN_NO_CUDA_TENSOR_OPS)
|
||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_PEDANTIC;
|
||||
#elif defined(EIGEN_CUDA_TF32)
|
||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_FAST_TF32;
|
||||
#else
|
||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
|
||||
#endif
|
||||
};
|
||||
template <>
|
||||
struct cuda_compute_type<double> {
|
||||
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
|
||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F_PEDANTIC;
|
||||
#else
|
||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
|
||||
#endif
|
||||
};
|
||||
template <>
|
||||
struct cuda_compute_type<std::complex<float>> {
|
||||
#if defined(EIGEN_NO_CUDA_TENSOR_OPS)
|
||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_PEDANTIC;
|
||||
#elif defined(EIGEN_CUDA_TF32)
|
||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_FAST_TF32;
|
||||
#else
|
||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
|
||||
#endif
|
||||
};
|
||||
template <>
|
||||
struct cuda_compute_type<std::complex<double>> {
|
||||
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
|
||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F_PEDANTIC;
|
||||
#else
|
||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
|
||||
#endif
|
||||
};
|
||||
// ---- GEMM algorithm hint ----------------------------------------------------
|
||||
|
||||
constexpr cublasGemmAlgo_t cuda_gemm_algo() {
|
||||
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
|
||||
return CUBLAS_GEMM_DEFAULT;
|
||||
#else
|
||||
return CUBLAS_GEMM_DEFAULT_TENSOR_OP;
|
||||
#endif
|
||||
}
|
||||
|
||||
// ---- Alpha/beta scalar type for cublasGemmEx --------------------------------
|
||||
// For standard types, alpha/beta match the scalar type.
|
||||
|
||||
template <typename Scalar>
|
||||
struct cuda_gemm_scalar {
|
||||
using type = Scalar;
|
||||
};
|
||||
|
||||
// ---- Type-specific cuBLAS wrappers ------------------------------------------
|
||||
@@ -154,6 +198,35 @@ inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cubla
|
||||
reinterpret_cast<cuDoubleComplex*>(C), ldc);
|
||||
}
|
||||
|
||||
// GEAM wrappers: C = alpha * op(A) + beta * op(B)
|
||||
// Covers transpose, scale, matrix add/subtract in one call.
|
||||
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
|
||||
const float* alpha, const float* A, int lda, const float* beta, const float* B,
|
||||
int ldb, float* C, int ldc) {
|
||||
return cublasSgeam(h, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
|
||||
}
|
||||
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
|
||||
const double* alpha, const double* A, int lda, const double* beta, const double* B,
|
||||
int ldb, double* C, int ldc) {
|
||||
return cublasDgeam(h, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
|
||||
}
|
||||
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
|
||||
const std::complex<float>* alpha, const std::complex<float>* A, int lda,
|
||||
const std::complex<float>* beta, const std::complex<float>* B, int ldb,
|
||||
std::complex<float>* C, int ldc) {
|
||||
return cublasCgeam(h, transa, transb, m, n, reinterpret_cast<const cuComplex*>(alpha),
|
||||
reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<const cuComplex*>(beta),
|
||||
reinterpret_cast<const cuComplex*>(B), ldb, reinterpret_cast<cuComplex*>(C), ldc);
|
||||
}
|
||||
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
|
||||
const std::complex<double>* alpha, const std::complex<double>* A, int lda,
|
||||
const std::complex<double>* beta, const std::complex<double>* B, int ldb,
|
||||
std::complex<double>* C, int ldc) {
|
||||
return cublasZgeam(h, transa, transb, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
|
||||
reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<const cuDoubleComplex*>(beta),
|
||||
reinterpret_cast<const cuDoubleComplex*>(B), ldb, reinterpret_cast<cuDoubleComplex*>(C), ldc);
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace Eigen
|
||||
|
||||
|
||||
@@ -91,6 +91,68 @@ struct cusolver_fill_mode<Upper, RowMajor> {
|
||||
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
|
||||
};
|
||||
|
||||
// ---- Type-specific cuSOLVER wrappers ----------------------------------------
|
||||
// cuSOLVER does not provide generic X variants for ormqr/unmqr. These overloaded
|
||||
// wrappers dispatch to the correct type-specific function.
|
||||
// For real types: ormqr (orthogonal Q). For complex types: unmqr (unitary Q).
|
||||
|
||||
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
|
||||
int n, int k, const float* A, int lda, const float* tau, float* C, int ldc,
|
||||
float* work, int lwork, int* info) {
|
||||
return cusolverDnSormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
|
||||
}
|
||||
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
|
||||
int n, int k, const double* A, int lda, const double* tau, double* C, int ldc,
|
||||
double* work, int lwork, int* info) {
|
||||
return cusolverDnDormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
|
||||
}
|
||||
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
|
||||
int n, int k, const std::complex<float>* A, int lda,
|
||||
const std::complex<float>* tau, std::complex<float>* C, int ldc,
|
||||
std::complex<float>* work, int lwork, int* info) {
|
||||
return cusolverDnCunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
|
||||
reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<cuComplex*>(C), ldc,
|
||||
reinterpret_cast<cuComplex*>(work), lwork, info);
|
||||
}
|
||||
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
|
||||
int n, int k, const std::complex<double>* A, int lda,
|
||||
const std::complex<double>* tau, std::complex<double>* C, int ldc,
|
||||
std::complex<double>* work, int lwork, int* info) {
|
||||
return cusolverDnZunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
|
||||
reinterpret_cast<const cuDoubleComplex*>(tau), reinterpret_cast<cuDoubleComplex*>(C), ldc,
|
||||
reinterpret_cast<cuDoubleComplex*>(work), lwork, info);
|
||||
}
|
||||
|
||||
// Buffer size wrappers for ormqr/unmqr.
|
||||
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
|
||||
cublasOperation_t trans, int m, int n, int k, const float* A,
|
||||
int lda, const float* tau, const float* C, int ldc, int* lwork) {
|
||||
return cusolverDnSormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
|
||||
}
|
||||
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
|
||||
cublasOperation_t trans, int m, int n, int k, const double* A,
|
||||
int lda, const double* tau, const double* C, int ldc, int* lwork) {
|
||||
return cusolverDnDormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
|
||||
}
|
||||
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
|
||||
cublasOperation_t trans, int m, int n, int k,
|
||||
const std::complex<float>* A, int lda,
|
||||
const std::complex<float>* tau, const std::complex<float>* C,
|
||||
int ldc, int* lwork) {
|
||||
return cusolverDnCunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
|
||||
reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<const cuComplex*>(C),
|
||||
ldc, lwork);
|
||||
}
|
||||
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
|
||||
cublasOperation_t trans, int m, int n, int k,
|
||||
const std::complex<double>* A, int lda,
|
||||
const std::complex<double>* tau, const std::complex<double>* C,
|
||||
int ldc, int* lwork) {
|
||||
return cusolverDnZunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
|
||||
reinterpret_cast<const cuDoubleComplex*>(tau),
|
||||
reinterpret_cast<const cuDoubleComplex*>(C), ldc, lwork);
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace Eigen
|
||||
|
||||
|
||||
@@ -58,8 +58,8 @@ void dispatch_gemm(
|
||||
|
||||
eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch");
|
||||
|
||||
const int64_t lda = A.outerStride();
|
||||
const int64_t ldb = B.outerStride();
|
||||
const int64_t lda = A.rows();
|
||||
const int64_t ldb = B.rows();
|
||||
|
||||
// Serialize all accesses to the destination buffer on this stream.
|
||||
if (!dst.empty()) {
|
||||
@@ -71,9 +71,13 @@ void dispatch_gemm(
|
||||
if (resized) {
|
||||
dst.resize(m, n);
|
||||
}
|
||||
const int64_t ldc = dst.outerStride();
|
||||
const int64_t ldc = dst.rows();
|
||||
|
||||
Scalar alpha_val = alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs());
|
||||
// cuBLAS requires alpha/beta as float for half/bfloat16 inputs.
|
||||
using GemmScalar = typename cuda_gemm_scalar<Scalar>::type;
|
||||
GemmScalar alpha_gval =
|
||||
static_cast<GemmScalar>(alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs()));
|
||||
GemmScalar beta_gval = static_cast<GemmScalar>(beta_val);
|
||||
|
||||
// Wait for operands to be ready on this stream.
|
||||
A.waitReady(ctx.stream());
|
||||
@@ -81,7 +85,7 @@ void dispatch_gemm(
|
||||
|
||||
// If there is no existing valid destination to accumulate into, treat it as
|
||||
// zero rather than reading uninitialized memory.
|
||||
if (resized && beta_val != Scalar(0) && dst.sizeInBytes() > 0) {
|
||||
if (resized && beta_gval != GemmScalar(0) && dst.sizeInBytes() > 0) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
|
||||
}
|
||||
|
||||
@@ -89,9 +93,9 @@ void dispatch_gemm(
|
||||
constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
|
||||
|
||||
EIGEN_CUBLAS_CHECK(cublasGemmEx(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n),
|
||||
static_cast<int>(k), &alpha_val, A.data(), dtype, static_cast<int>(lda), B.data(),
|
||||
dtype, static_cast<int>(ldb), &beta_val, dst.data(), dtype, static_cast<int>(ldc),
|
||||
compute, CUBLAS_GEMM_DEFAULT));
|
||||
static_cast<int>(k), &alpha_gval, A.data(), dtype, static_cast<int>(lda), B.data(),
|
||||
dtype, static_cast<int>(ldb), &beta_gval, dst.data(), dtype, static_cast<int>(ldc),
|
||||
compute, cuda_gemm_algo()));
|
||||
|
||||
dst.recordReady(ctx.stream());
|
||||
}
|
||||
@@ -125,9 +129,9 @@ void dispatch_llt_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LltSol
|
||||
|
||||
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
||||
constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo, ColMajor>::value;
|
||||
const int64_t lda = static_cast<int64_t>(A.outerStride());
|
||||
const int64_t ldb = static_cast<int64_t>(B.outerStride());
|
||||
eigen_assert(ldb == static_cast<int64_t>(B.rows()) && "DeviceMatrix must be densely packed");
|
||||
const int64_t lda = static_cast<int64_t>(A.rows());
|
||||
const int64_t ldb = static_cast<int64_t>(B.rows());
|
||||
|
||||
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
|
||||
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||
|
||||
@@ -163,7 +167,7 @@ void dispatch_llt_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LltSol
|
||||
// Solve.
|
||||
DeviceBuffer d_solve_info(sizeof(int));
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), nrhs, dtype,
|
||||
d_factor.ptr, lda, dtype, dst.data(), static_cast<int64_t>(dst.outerStride()),
|
||||
d_factor.ptr, lda, dtype, dst.data(), static_cast<int64_t>(dst.rows()),
|
||||
static_cast<int*>(d_solve_info.ptr)));
|
||||
|
||||
// Sync to ensure workspace locals can be freed safely.
|
||||
@@ -201,9 +205,9 @@ void dispatch_lu_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LuSolve
|
||||
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||
|
||||
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
||||
const int64_t lda = static_cast<int64_t>(A.outerStride());
|
||||
const int64_t ldb = static_cast<int64_t>(B.outerStride());
|
||||
eigen_assert(ldb == static_cast<int64_t>(B.rows()) && "DeviceMatrix must be densely packed");
|
||||
const int64_t lda = static_cast<int64_t>(A.rows());
|
||||
const int64_t ldb = static_cast<int64_t>(B.rows());
|
||||
|
||||
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
|
||||
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||
const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t);
|
||||
@@ -245,7 +249,7 @@ void dispatch_lu_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LuSolve
|
||||
DeviceBuffer d_solve_info(sizeof(int));
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, static_cast<int64_t>(n), nrhs,
|
||||
dtype, d_lu.ptr, lda, static_cast<const int64_t*>(d_ipiv.ptr), dtype,
|
||||
dst.data(), static_cast<int64_t>(dst.outerStride()),
|
||||
dst.data(), static_cast<int64_t>(dst.rows()),
|
||||
static_cast<int*>(d_solve_info.ptr)));
|
||||
|
||||
// Sync to ensure workspace locals can be freed safely.
|
||||
@@ -285,15 +289,15 @@ void dispatch_trsm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const TrsmExpr<Sc
|
||||
|
||||
// D2D copy B → dst (trsm is in-place on the RHS).
|
||||
dst.resize(n, B.cols());
|
||||
const size_t rhs_bytes = static_cast<size_t>(dst.outerStride()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||
const size_t rhs_bytes = static_cast<size_t>(dst.rows()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
|
||||
|
||||
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
|
||||
Scalar alpha(1);
|
||||
|
||||
EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs,
|
||||
&alpha, A.data(), static_cast<int>(A.outerStride()), dst.data(),
|
||||
static_cast<int>(dst.outerStride())));
|
||||
&alpha, A.data(), static_cast<int>(A.rows()), dst.data(),
|
||||
static_cast<int>(dst.rows())));
|
||||
|
||||
dst.recordReady(ctx.stream());
|
||||
}
|
||||
@@ -329,8 +333,8 @@ void dispatch_symm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SymmExpr<Sc
|
||||
Scalar alpha(1), beta(0);
|
||||
|
||||
EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &alpha, A.data(),
|
||||
static_cast<int>(A.outerStride()), B.data(), static_cast<int>(B.outerStride()), &beta,
|
||||
dst.data(), static_cast<int>(dst.outerStride())));
|
||||
static_cast<int>(A.rows()), B.data(), static_cast<int>(B.rows()), &beta, dst.data(),
|
||||
static_cast<int>(dst.rows())));
|
||||
|
||||
dst.recordReady(ctx.stream());
|
||||
}
|
||||
@@ -367,8 +371,7 @@ void dispatch_syrk(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SyrkExpr<Sc
|
||||
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
|
||||
|
||||
EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(),
|
||||
static_cast<int>(A.outerStride()), &beta_val, dst.data(),
|
||||
static_cast<int>(dst.outerStride())));
|
||||
static_cast<int>(A.rows()), &beta_val, dst.data(), static_cast<int>(dst.rows())));
|
||||
|
||||
dst.recordReady(ctx.stream());
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
// Typed RAII wrapper for a dense matrix in GPU device memory.
|
||||
//
|
||||
// DeviceMatrix<Scalar> holds a column-major matrix on the GPU with tracked
|
||||
// dimensions and leading dimension. It can be passed to GPU solvers
|
||||
// dimensions. Always dense (leading dimension = rows). It can be passed to GPU solvers
|
||||
// (GpuLLT, GpuLU, future cuBLAS/cuDSS) without host round-trips.
|
||||
//
|
||||
// Cross-stream safety is automatic: an internal CUDA event tracks when the
|
||||
@@ -25,7 +25,7 @@
|
||||
// MatrixXd X = d_X.toHost(); // download + block
|
||||
//
|
||||
// Async variants:
|
||||
// auto d_A = DeviceMatrix<double>::fromHostAsync(A.data(), n, n, n, stream);
|
||||
// auto d_A = DeviceMatrix<double>::fromHostAsync(A.data(), n, n, stream);
|
||||
// auto transfer = d_X.toHostAsync(stream); // enqueue D2H
|
||||
// // ... overlap with other work ...
|
||||
// MatrixXd X = transfer.get(); // block + retrieve
|
||||
@@ -157,7 +157,8 @@ class HostTransfer {
|
||||
*
|
||||
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
|
||||
*
|
||||
* Owns a device allocation with tracked dimensions and leading dimension.
|
||||
* Owns a device allocation with tracked dimensions. Always dense
|
||||
* (leading dimension = rows; no stride padding).
|
||||
* An internal CUDA event records when the data was last written, enabling
|
||||
* safe cross-stream consumption without user-visible synchronization.
|
||||
*
|
||||
@@ -177,7 +178,7 @@ class DeviceMatrix {
|
||||
DeviceMatrix() = default;
|
||||
|
||||
/** Allocate uninitialized device memory for a rows x cols matrix. */
|
||||
DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols), outerStride_(rows) {
|
||||
DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols) {
|
||||
eigen_assert(rows >= 0 && cols >= 0);
|
||||
size_t bytes = sizeInBytes();
|
||||
if (bytes > 0) {
|
||||
@@ -196,14 +197,12 @@ class DeviceMatrix {
|
||||
: data_(o.data_),
|
||||
rows_(o.rows_),
|
||||
cols_(o.cols_),
|
||||
outerStride_(o.outerStride_),
|
||||
ready_event_(o.ready_event_),
|
||||
ready_stream_(o.ready_stream_),
|
||||
retained_buffer_(std::move(o.retained_buffer_)) {
|
||||
o.data_ = nullptr;
|
||||
o.rows_ = 0;
|
||||
o.cols_ = 0;
|
||||
o.outerStride_ = 0;
|
||||
o.ready_event_ = nullptr;
|
||||
o.ready_stream_ = nullptr;
|
||||
}
|
||||
@@ -215,14 +214,12 @@ class DeviceMatrix {
|
||||
data_ = o.data_;
|
||||
rows_ = o.rows_;
|
||||
cols_ = o.cols_;
|
||||
outerStride_ = o.outerStride_;
|
||||
ready_event_ = o.ready_event_;
|
||||
ready_stream_ = o.ready_stream_;
|
||||
retained_buffer_ = std::move(o.retained_buffer_);
|
||||
o.data_ = nullptr;
|
||||
o.rows_ = 0;
|
||||
o.cols_ = 0;
|
||||
o.outerStride_ = 0;
|
||||
o.ready_event_ = nullptr;
|
||||
o.ready_stream_ = nullptr;
|
||||
}
|
||||
@@ -259,29 +256,17 @@ class DeviceMatrix {
|
||||
* The caller must keep \p host_data alive until the transfer completes
|
||||
* (check via the internal event or synchronize the stream).
|
||||
*
|
||||
* \param host_data Pointer to contiguous column-major host data.
|
||||
* \param rows Number of rows.
|
||||
* \param cols Number of columns.
|
||||
* \param outerStride Leading dimension (>= rows). Use rows for dense.
|
||||
* \param stream CUDA stream for the transfer.
|
||||
* \param host_data Pointer to contiguous column-major host data.
|
||||
* \param rows Number of rows.
|
||||
* \param cols Number of columns.
|
||||
* \param stream CUDA stream for the transfer.
|
||||
*/
|
||||
static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, Index outerStride,
|
||||
cudaStream_t stream) {
|
||||
eigen_assert(rows >= 0 && cols >= 0 && outerStride >= rows);
|
||||
static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, cudaStream_t stream) {
|
||||
eigen_assert(rows >= 0 && cols >= 0);
|
||||
eigen_assert(host_data != nullptr || (rows == 0 || cols == 0));
|
||||
DeviceMatrix dm(rows, cols);
|
||||
if (dm.sizeInBytes() > 0) {
|
||||
// If outerStride == rows (dense), single contiguous copy.
|
||||
// Otherwise, copy column by column (strided layout).
|
||||
if (outerStride == rows) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(dm.data_, host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
|
||||
} else {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(dm.data_, static_cast<size_t>(rows) * sizeof(Scalar), host_data,
|
||||
static_cast<size_t>(outerStride) * sizeof(Scalar),
|
||||
static_cast<size_t>(rows) * sizeof(Scalar),
|
||||
static_cast<size_t>(cols), cudaMemcpyHostToDevice, stream));
|
||||
}
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dm.data_, host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
|
||||
dm.recordReady(stream);
|
||||
}
|
||||
return dm;
|
||||
@@ -360,7 +345,6 @@ class DeviceMatrix {
|
||||
retained_buffer_ = internal::DeviceBuffer();
|
||||
rows_ = rows;
|
||||
cols_ = cols;
|
||||
outerStride_ = rows;
|
||||
size_t bytes = sizeInBytes();
|
||||
if (bytes > 0) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
|
||||
@@ -373,11 +357,10 @@ class DeviceMatrix {
|
||||
const Scalar* data() const { return data_; }
|
||||
Index rows() const { return rows_; }
|
||||
Index cols() const { return cols_; }
|
||||
Index outerStride() const { return outerStride_; }
|
||||
bool empty() const { return rows_ == 0 || cols_ == 0; }
|
||||
|
||||
/** Size of the device allocation in bytes. */
|
||||
size_t sizeInBytes() const { return static_cast<size_t>(outerStride_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
|
||||
size_t sizeInBytes() const { return static_cast<size_t>(rows_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
|
||||
|
||||
// ---- Event synchronization (public for library dispatch interop) ---------
|
||||
|
||||
@@ -466,8 +449,7 @@ class DeviceMatrix {
|
||||
private:
|
||||
// ---- Private: adopt a raw device pointer (used by friend solvers) --------
|
||||
|
||||
DeviceMatrix(Scalar* device_ptr, Index rows, Index cols, Index outerStride)
|
||||
: data_(device_ptr), rows_(rows), cols_(cols), outerStride_(outerStride) {}
|
||||
DeviceMatrix(Scalar* device_ptr, Index rows, Index cols) : data_(device_ptr), rows_(rows), cols_(cols) {}
|
||||
|
||||
/** Transfer ownership of the device pointer out. Zeros internal state. */
|
||||
Scalar* release() {
|
||||
@@ -475,7 +457,6 @@ class DeviceMatrix {
|
||||
data_ = nullptr;
|
||||
rows_ = 0;
|
||||
cols_ = 0;
|
||||
outerStride_ = 0;
|
||||
if (ready_event_) {
|
||||
(void)cudaEventDestroy(ready_event_);
|
||||
ready_event_ = nullptr;
|
||||
@@ -500,13 +481,18 @@ class DeviceMatrix {
|
||||
friend class GpuLLT;
|
||||
template <typename>
|
||||
friend class GpuLU;
|
||||
template <typename>
|
||||
friend class GpuQR;
|
||||
template <typename>
|
||||
friend class GpuSVD;
|
||||
template <typename>
|
||||
friend class GpuSelfAdjointEigenSolver;
|
||||
|
||||
// ---- Data members --------------------------------------------------------
|
||||
|
||||
Scalar* data_ = nullptr;
|
||||
Index rows_ = 0;
|
||||
Index cols_ = 0;
|
||||
Index outerStride_ = 0;
|
||||
cudaEvent_t ready_event_ = nullptr; // internal: tracks last write completion
|
||||
cudaStream_t ready_stream_ = nullptr; // stream that recorded ready_event_ (for same-stream skip)
|
||||
internal::DeviceBuffer retained_buffer_; // internal: keeps async aux buffers alive
|
||||
|
||||
228
Eigen/src/GPU/GpuEigenSolver.h
Normal file
228
Eigen/src/GPU/GpuEigenSolver.h
Normal file
@@ -0,0 +1,228 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// GPU self-adjoint eigenvalue decomposition using cuSOLVER.
|
||||
//
|
||||
// Wraps cusolverDnXsyevd (symmetric/Hermitian divide-and-conquer).
|
||||
// Stores eigenvalues and eigenvectors on device.
|
||||
//
|
||||
// Usage:
|
||||
// GpuSelfAdjointEigenSolver<double> es(A);
|
||||
// VectorXd eigenvals = es.eigenvalues();
|
||||
// MatrixXd eigenvecs = es.eigenvectors();
|
||||
|
||||
#ifndef EIGEN_GPU_EIGENSOLVER_H
|
||||
#define EIGEN_GPU_EIGENSOLVER_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
#include "./CuSolverSupport.h"
|
||||
#include <vector>
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
template <typename Scalar_>
|
||||
class GpuSelfAdjointEigenSolver {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||
using RealVector = Matrix<RealScalar, Dynamic, 1>;
|
||||
|
||||
/** Eigenvalue-only or eigenvalues + eigenvectors. */
|
||||
enum ComputeMode { EigenvaluesOnly, ComputeEigenvectors };
|
||||
|
||||
GpuSelfAdjointEigenSolver() { init_context(); }
|
||||
|
||||
template <typename InputType>
|
||||
explicit GpuSelfAdjointEigenSolver(const EigenBase<InputType>& A, ComputeMode mode = ComputeEigenvectors) {
|
||||
init_context();
|
||||
compute(A, mode);
|
||||
}
|
||||
|
||||
~GpuSelfAdjointEigenSolver() {
|
||||
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||
}
|
||||
|
||||
GpuSelfAdjointEigenSolver(const GpuSelfAdjointEigenSolver&) = delete;
|
||||
GpuSelfAdjointEigenSolver& operator=(const GpuSelfAdjointEigenSolver&) = delete;
|
||||
|
||||
// ---- Factorization -------------------------------------------------------
|
||||
|
||||
template <typename InputType>
|
||||
GpuSelfAdjointEigenSolver& compute(const EigenBase<InputType>& A, ComputeMode mode = ComputeEigenvectors) {
|
||||
eigen_assert(A.rows() == A.cols() && "GpuSelfAdjointEigenSolver requires a square matrix");
|
||||
mode_ = mode;
|
||||
n_ = A.rows();
|
||||
info_ = InvalidInput;
|
||||
info_synced_ = false;
|
||||
|
||||
if (n_ == 0) {
|
||||
info_ = Success;
|
||||
info_synced_ = true;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const PlainMatrix mat(A.derived());
|
||||
lda_ = static_cast<int64_t>(n_);
|
||||
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||
|
||||
// syevd overwrites A with eigenvectors (if requested).
|
||||
d_A_ = internal::DeviceBuffer(mat_bytes);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
|
||||
factorize();
|
||||
return *this;
|
||||
}
|
||||
|
||||
GpuSelfAdjointEigenSolver& compute(const DeviceMatrix<Scalar>& d_A, ComputeMode mode = ComputeEigenvectors) {
|
||||
eigen_assert(d_A.rows() == d_A.cols() && "GpuSelfAdjointEigenSolver requires a square matrix");
|
||||
mode_ = mode;
|
||||
n_ = d_A.rows();
|
||||
info_ = InvalidInput;
|
||||
info_synced_ = false;
|
||||
|
||||
if (n_ == 0) {
|
||||
info_ = Success;
|
||||
info_synced_ = true;
|
||||
return *this;
|
||||
}
|
||||
|
||||
d_A.waitReady(stream_);
|
||||
lda_ = static_cast<int64_t>(n_);
|
||||
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||
|
||||
d_A_ = internal::DeviceBuffer(mat_bytes);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
|
||||
|
||||
factorize();
|
||||
return *this;
|
||||
}
|
||||
|
||||
// ---- Accessors -----------------------------------------------------------
|
||||
|
||||
ComputationInfo info() const {
|
||||
sync_info();
|
||||
return info_;
|
||||
}
|
||||
|
||||
Index cols() const { return n_; }
|
||||
Index rows() const { return n_; }
|
||||
|
||||
/** Eigenvalues in ascending order. Downloads from device. */
|
||||
RealVector eigenvalues() const {
|
||||
sync_info();
|
||||
eigen_assert(info_ == Success);
|
||||
RealVector W(n_);
|
||||
if (n_ > 0) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpy(W.data(), d_W_.ptr, static_cast<size_t>(n_) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
|
||||
}
|
||||
return W;
|
||||
}
|
||||
|
||||
/** Eigenvectors (columns). Downloads from device.
|
||||
* Requires ComputeEigenvectors mode. */
|
||||
PlainMatrix eigenvectors() const {
|
||||
sync_info();
|
||||
eigen_assert(info_ == Success);
|
||||
eigen_assert(mode_ == ComputeEigenvectors && "eigenvectors() requires ComputeEigenvectors mode");
|
||||
PlainMatrix V(n_, n_);
|
||||
if (n_ > 0) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(V.data(), d_A_.ptr,
|
||||
static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar),
|
||||
cudaMemcpyDeviceToHost));
|
||||
}
|
||||
return V;
|
||||
}
|
||||
|
||||
cudaStream_t stream() const { return stream_; }
|
||||
|
||||
private:
|
||||
cudaStream_t stream_ = nullptr;
|
||||
cusolverDnHandle_t handle_ = nullptr;
|
||||
internal::CusolverParams params_;
|
||||
internal::DeviceBuffer d_A_; // overwritten with eigenvectors by syevd
|
||||
internal::DeviceBuffer d_W_; // eigenvalues (RealScalar, length n)
|
||||
internal::DeviceBuffer d_scratch_; // workspace + info
|
||||
size_t scratch_size_ = 0;
|
||||
std::vector<char> h_workspace_;
|
||||
ComputeMode mode_ = ComputeEigenvectors;
|
||||
Index n_ = 0;
|
||||
int64_t lda_ = 0;
|
||||
ComputationInfo info_ = InvalidInput;
|
||||
int info_word_ = 0;
|
||||
bool info_synced_ = true;
|
||||
|
||||
void init_context() {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
|
||||
ensure_scratch(0);
|
||||
}
|
||||
|
||||
void ensure_scratch(size_t workspace_bytes) {
|
||||
constexpr size_t kAlign = 16;
|
||||
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
|
||||
size_t needed = workspace_bytes + sizeof(int);
|
||||
if (needed > scratch_size_) {
|
||||
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
d_scratch_ = internal::DeviceBuffer(needed);
|
||||
scratch_size_ = needed;
|
||||
}
|
||||
}
|
||||
|
||||
void* scratch_workspace() const { return d_scratch_.ptr; }
|
||||
int* scratch_info() const {
|
||||
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
|
||||
}
|
||||
|
||||
void sync_info() const {
|
||||
if (!info_synced_) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
const_cast<GpuSelfAdjointEigenSolver*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
|
||||
const_cast<GpuSelfAdjointEigenSolver*>(this)->info_synced_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
void factorize() {
|
||||
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||
constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
|
||||
|
||||
info_synced_ = false;
|
||||
info_ = InvalidInput;
|
||||
|
||||
d_W_ = internal::DeviceBuffer(static_cast<size_t>(n_) * sizeof(RealScalar));
|
||||
|
||||
const cusolverEigMode_t jobz =
|
||||
(mode_ == ComputeEigenvectors) ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
|
||||
|
||||
// Use lower triangle (standard convention).
|
||||
constexpr cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
|
||||
|
||||
size_t dev_ws = 0, host_ws = 0;
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd_bufferSize(handle_, params_.p, jobz, uplo, static_cast<int64_t>(n_), dtype,
|
||||
d_A_.ptr, lda_, rtype, d_W_.ptr, dtype, &dev_ws, &host_ws));
|
||||
|
||||
ensure_scratch(dev_ws);
|
||||
h_workspace_.resize(host_ws);
|
||||
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd(handle_, params_.p, jobz, uplo, static_cast<int64_t>(n_), dtype, d_A_.ptr,
|
||||
lda_, rtype, d_W_.ptr, dtype, scratch_workspace(), dev_ws,
|
||||
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
|
||||
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_EIGENSOLVER_H
|
||||
@@ -149,7 +149,7 @@ class GpuLLT {
|
||||
|
||||
// Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions).
|
||||
const PlainMatrix mat(A.derived());
|
||||
lda_ = static_cast<int64_t>(mat.outerStride());
|
||||
lda_ = static_cast<int64_t>(mat.rows());
|
||||
allocate_factor_storage();
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_factor_.ptr, mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_));
|
||||
@@ -163,7 +163,7 @@ class GpuLLT {
|
||||
eigen_assert(d_A.rows() == d_A.cols());
|
||||
if (!begin_compute(d_A.rows())) return *this;
|
||||
|
||||
lda_ = static_cast<int64_t>(d_A.outerStride());
|
||||
lda_ = static_cast<int64_t>(d_A.rows());
|
||||
d_A.waitReady(stream_);
|
||||
allocate_factor_storage();
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
@@ -178,7 +178,7 @@ class GpuLLT {
|
||||
eigen_assert(d_A.rows() == d_A.cols());
|
||||
if (!begin_compute(d_A.rows())) return *this;
|
||||
|
||||
lda_ = static_cast<int64_t>(d_A.outerStride());
|
||||
lda_ = static_cast<int64_t>(d_A.rows());
|
||||
d_A.waitReady(stream_);
|
||||
d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
|
||||
|
||||
@@ -205,7 +205,7 @@ class GpuLLT {
|
||||
|
||||
const PlainMatrix rhs(B);
|
||||
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
||||
const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
|
||||
const int64_t ldb = static_cast<int64_t>(rhs.rows());
|
||||
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_x_ptr, rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
|
||||
@@ -234,7 +234,7 @@ class GpuLLT {
|
||||
eigen_assert(d_B.rows() == n_);
|
||||
d_B.waitReady(stream_);
|
||||
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
|
||||
const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
|
||||
const int64_t ldb = static_cast<int64_t>(d_B.rows());
|
||||
return solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_x_ptr, d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
|
||||
@@ -332,7 +332,7 @@ class GpuLLT {
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, static_cast<int64_t>(n_), nrhs, dtype,
|
||||
d_factor_.ptr, lda_, dtype, d_x_ptr, ldb, scratch_info()));
|
||||
|
||||
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs), static_cast<Index>(ldb));
|
||||
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs));
|
||||
result.recordReady(stream_);
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -140,7 +140,7 @@ class GpuLU {
|
||||
if (!begin_compute(A.rows())) return *this;
|
||||
|
||||
const PlainMatrix mat(A.derived());
|
||||
lda_ = static_cast<int64_t>(mat.outerStride());
|
||||
lda_ = static_cast<int64_t>(mat.rows());
|
||||
allocate_lu_storage();
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_));
|
||||
|
||||
@@ -153,7 +153,7 @@ class GpuLU {
|
||||
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
|
||||
if (!begin_compute(d_A.rows())) return *this;
|
||||
|
||||
lda_ = static_cast<int64_t>(d_A.outerStride());
|
||||
lda_ = static_cast<int64_t>(d_A.rows());
|
||||
d_A.waitReady(stream_);
|
||||
allocate_lu_storage();
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_));
|
||||
@@ -167,7 +167,7 @@ class GpuLU {
|
||||
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
|
||||
if (!begin_compute(d_A.rows())) return *this;
|
||||
|
||||
lda_ = static_cast<int64_t>(d_A.outerStride());
|
||||
lda_ = static_cast<int64_t>(d_A.rows());
|
||||
d_A.waitReady(stream_);
|
||||
d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
|
||||
|
||||
@@ -190,7 +190,7 @@ class GpuLU {
|
||||
|
||||
const PlainMatrix rhs(B);
|
||||
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
||||
const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
|
||||
const int64_t ldb = static_cast<int64_t>(rhs.rows());
|
||||
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_x_ptr, rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
|
||||
@@ -213,7 +213,7 @@ class GpuLU {
|
||||
eigen_assert(d_B.rows() == n_);
|
||||
d_B.waitReady(stream_);
|
||||
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
|
||||
const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
|
||||
const int64_t ldb = static_cast<int64_t>(d_B.rows());
|
||||
return solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(d_x_ptr, d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
|
||||
@@ -305,7 +305,7 @@ class GpuLU {
|
||||
lda_, static_cast<const int64_t*>(d_ipiv_.ptr), dtype, d_x_ptr, ldb,
|
||||
scratch_info()));
|
||||
|
||||
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs), static_cast<Index>(ldb));
|
||||
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs));
|
||||
result.recordReady(stream_);
|
||||
return result;
|
||||
}
|
||||
|
||||
386
Eigen/src/GPU/GpuQR.h
Normal file
386
Eigen/src/GPU/GpuQR.h
Normal file
@@ -0,0 +1,386 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// GPU QR decomposition using cuSOLVER.
|
||||
//
|
||||
// Wraps cusolverDnXgeqrf (factorization), cusolverDnXormqr (apply Q),
|
||||
// cusolverDnXorgqr (form Q), and cublasXtrsm (triangular solve on R).
|
||||
//
|
||||
// The factored matrix (reflectors + R) and tau stay in device memory.
|
||||
// Solve uses ormqr + trsm without forming Q explicitly.
|
||||
//
|
||||
// Usage:
|
||||
// GpuQR<double> qr(A); // upload A, geqrf
|
||||
// if (qr.info() != Success) { ... }
|
||||
// MatrixXd X = qr.solve(B); // Q^H * B via ormqr, then trsm on R
|
||||
//
|
||||
// Expression syntax:
|
||||
// d_X = d_A.qr().solve(d_B); // temporary, no caching
|
||||
|
||||
#ifndef EIGEN_GPU_QR_H
|
||||
#define EIGEN_GPU_QR_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
#include "./CuSolverSupport.h"
|
||||
#include "./CuBlasSupport.h"
|
||||
#include <vector>
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
template <typename Scalar_>
|
||||
class GpuQR {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||
|
||||
GpuQR() { init_context(); }
|
||||
|
||||
template <typename InputType>
|
||||
explicit GpuQR(const EigenBase<InputType>& A) {
|
||||
init_context();
|
||||
compute(A);
|
||||
}
|
||||
|
||||
~GpuQR() {
|
||||
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||
if (cublas_) (void)cublasDestroy(cublas_);
|
||||
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||
}
|
||||
|
||||
GpuQR(const GpuQR&) = delete;
|
||||
GpuQR& operator=(const GpuQR&) = delete;
|
||||
|
||||
GpuQR(GpuQR&& o) noexcept
|
||||
: stream_(o.stream_),
|
||||
handle_(o.handle_),
|
||||
cublas_(o.cublas_),
|
||||
params_(std::move(o.params_)),
|
||||
d_qr_(std::move(o.d_qr_)),
|
||||
d_tau_(std::move(o.d_tau_)),
|
||||
d_scratch_(std::move(o.d_scratch_)),
|
||||
scratch_size_(o.scratch_size_),
|
||||
h_workspace_(std::move(o.h_workspace_)),
|
||||
m_(o.m_),
|
||||
n_(o.n_),
|
||||
lda_(o.lda_),
|
||||
info_(o.info_),
|
||||
info_word_(o.info_word_),
|
||||
info_synced_(o.info_synced_) {
|
||||
o.stream_ = nullptr;
|
||||
o.handle_ = nullptr;
|
||||
o.cublas_ = nullptr;
|
||||
o.scratch_size_ = 0;
|
||||
o.m_ = 0;
|
||||
o.n_ = 0;
|
||||
o.lda_ = 0;
|
||||
o.info_ = InvalidInput;
|
||||
o.info_word_ = 0;
|
||||
o.info_synced_ = true;
|
||||
}
|
||||
|
||||
GpuQR& operator=(GpuQR&& o) noexcept {
|
||||
if (this != &o) {
|
||||
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||
if (cublas_) (void)cublasDestroy(cublas_);
|
||||
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||
stream_ = o.stream_;
|
||||
handle_ = o.handle_;
|
||||
cublas_ = o.cublas_;
|
||||
params_ = std::move(o.params_);
|
||||
d_qr_ = std::move(o.d_qr_);
|
||||
d_tau_ = std::move(o.d_tau_);
|
||||
d_scratch_ = std::move(o.d_scratch_);
|
||||
scratch_size_ = o.scratch_size_;
|
||||
h_workspace_ = std::move(o.h_workspace_);
|
||||
m_ = o.m_;
|
||||
n_ = o.n_;
|
||||
lda_ = o.lda_;
|
||||
info_ = o.info_;
|
||||
info_word_ = o.info_word_;
|
||||
info_synced_ = o.info_synced_;
|
||||
o.stream_ = nullptr;
|
||||
o.handle_ = nullptr;
|
||||
o.cublas_ = nullptr;
|
||||
o.scratch_size_ = 0;
|
||||
o.m_ = 0;
|
||||
o.n_ = 0;
|
||||
o.lda_ = 0;
|
||||
o.info_ = InvalidInput;
|
||||
o.info_word_ = 0;
|
||||
o.info_synced_ = true;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// ---- Factorization -------------------------------------------------------
|
||||
|
||||
template <typename InputType>
|
||||
GpuQR& compute(const EigenBase<InputType>& A) {
|
||||
m_ = A.rows();
|
||||
n_ = A.cols();
|
||||
info_ = InvalidInput;
|
||||
info_synced_ = false;
|
||||
|
||||
if (m_ == 0 || n_ == 0) {
|
||||
info_ = Success;
|
||||
info_synced_ = true;
|
||||
return *this;
|
||||
}
|
||||
|
||||
const PlainMatrix mat(A.derived());
|
||||
lda_ = static_cast<int64_t>(mat.rows());
|
||||
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||
const size_t tau_bytes = static_cast<size_t>((std::min)(m_, n_)) * sizeof(Scalar);
|
||||
|
||||
d_qr_ = internal::DeviceBuffer(mat_bytes);
|
||||
d_tau_ = internal::DeviceBuffer(tau_bytes);
|
||||
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_qr_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
|
||||
|
||||
factorize();
|
||||
return *this;
|
||||
}
|
||||
|
||||
GpuQR& compute(const DeviceMatrix<Scalar>& d_A) {
|
||||
m_ = d_A.rows();
|
||||
n_ = d_A.cols();
|
||||
info_ = InvalidInput;
|
||||
info_synced_ = false;
|
||||
|
||||
if (m_ == 0 || n_ == 0) {
|
||||
info_ = Success;
|
||||
info_synced_ = true;
|
||||
return *this;
|
||||
}
|
||||
|
||||
lda_ = static_cast<int64_t>(d_A.rows());
|
||||
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||
const size_t tau_bytes = static_cast<size_t>((std::min)(m_, n_)) * sizeof(Scalar);
|
||||
|
||||
d_A.waitReady(stream_);
|
||||
d_qr_ = internal::DeviceBuffer(mat_bytes);
|
||||
d_tau_ = internal::DeviceBuffer(tau_bytes);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_qr_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
|
||||
|
||||
factorize();
|
||||
return *this;
|
||||
}
|
||||
|
||||
// ---- Solve ---------------------------------------------------------------
|
||||
|
||||
/** Solve A * X = B via QR: X = R^{-1} * Q^H * B (least-squares for m >= n).
|
||||
* Uses ormqr (apply Q^H) + trsm (solve R), without forming Q explicitly.
|
||||
* Requires m >= n (overdetermined or square). Underdetermined not supported. */
|
||||
template <typename Rhs>
|
||||
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
|
||||
sync_info();
|
||||
eigen_assert(info_ == Success && "GpuQR::solve called on a failed or uninitialized factorization");
|
||||
eigen_assert(B.rows() == m_);
|
||||
eigen_assert(m_ >= n_ && "GpuQR::solve requires m >= n (use SVD for underdetermined systems)");
|
||||
|
||||
const PlainMatrix rhs(B);
|
||||
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
||||
const int64_t ldb = static_cast<int64_t>(rhs.rows()); // = m_
|
||||
const size_t b_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||
|
||||
// Upload B to device (m × nrhs buffer).
|
||||
internal::DeviceBuffer d_B(b_bytes);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.ptr, rhs.data(), b_bytes, cudaMemcpyHostToDevice, stream_));
|
||||
|
||||
// Apply Q^H to B in-place: d_B becomes m × nrhs, first n rows hold Q^H * B relevant part.
|
||||
apply_QH(d_B.ptr, ldb, nrhs);
|
||||
|
||||
// Solve R * X = (Q^H * B)[0:n,:] via trsm on the first n rows.
|
||||
Scalar alpha(1);
|
||||
EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
|
||||
CUBLAS_DIAG_NON_UNIT, static_cast<int>(n_), static_cast<int>(nrhs), &alpha,
|
||||
static_cast<const Scalar*>(d_qr_.ptr), static_cast<int>(lda_),
|
||||
static_cast<Scalar*>(d_B.ptr), static_cast<int>(ldb)));
|
||||
|
||||
// Download the first n rows of each column (stride = ldb = m, width = n).
|
||||
PlainMatrix X(n_, rhs.cols());
|
||||
if (m_ == n_) {
|
||||
// Square: dense copy, no stride mismatch.
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_B.ptr,
|
||||
static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||
cudaMemcpyDeviceToHost, stream_));
|
||||
} else {
|
||||
// Overdetermined: 2D copy to extract first n rows from each column.
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(
|
||||
X.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_B.ptr, static_cast<size_t>(ldb) * sizeof(Scalar),
|
||||
static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs), cudaMemcpyDeviceToHost, stream_));
|
||||
}
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
return X;
|
||||
}
|
||||
|
||||
/** Solve with device-resident RHS. Returns n × nrhs DeviceMatrix. */
|
||||
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
|
||||
sync_info();
|
||||
eigen_assert(info_ == Success && "GpuQR::solve called on a failed or uninitialized factorization");
|
||||
eigen_assert(d_B.rows() == m_);
|
||||
eigen_assert(m_ >= n_ && "GpuQR::solve requires m >= n (use SVD for underdetermined systems)");
|
||||
d_B.waitReady(stream_);
|
||||
|
||||
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
|
||||
const int64_t ldb = static_cast<int64_t>(d_B.rows()); // = m_
|
||||
const size_t b_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||
|
||||
// D2D copy B into working buffer (ormqr and trsm are in-place).
|
||||
internal::DeviceBuffer d_work(b_bytes);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_work.ptr, d_B.data(), b_bytes, cudaMemcpyDeviceToDevice, stream_));
|
||||
|
||||
apply_QH(d_work.ptr, ldb, nrhs);
|
||||
|
||||
// trsm on the first n rows.
|
||||
Scalar alpha(1);
|
||||
EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
|
||||
CUBLAS_DIAG_NON_UNIT, static_cast<int>(n_), static_cast<int>(nrhs), &alpha,
|
||||
static_cast<const Scalar*>(d_qr_.ptr), static_cast<int>(lda_),
|
||||
static_cast<Scalar*>(d_work.ptr), static_cast<int>(ldb)));
|
||||
|
||||
if (m_ == n_) {
|
||||
// Square: result is the whole buffer, dense.
|
||||
DeviceMatrix<Scalar> result(static_cast<Scalar*>(d_work.ptr), n_, static_cast<Index>(nrhs));
|
||||
d_work.ptr = nullptr; // transfer ownership
|
||||
result.recordReady(stream_);
|
||||
return result;
|
||||
} else {
|
||||
// Overdetermined: copy first n rows of each column into a dense n × nrhs result.
|
||||
DeviceMatrix<Scalar> result(n_, static_cast<Index>(nrhs));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(result.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_work.ptr,
|
||||
static_cast<size_t>(ldb) * sizeof(Scalar),
|
||||
static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs),
|
||||
cudaMemcpyDeviceToDevice, stream_));
|
||||
result.recordReady(stream_);
|
||||
return result;
|
||||
// d_work freed here via RAII — safe because stream is ordered.
|
||||
}
|
||||
}
|
||||
|
||||
// ---- Accessors -----------------------------------------------------------
|
||||
|
||||
ComputationInfo info() const {
|
||||
sync_info();
|
||||
return info_;
|
||||
}
|
||||
|
||||
Index rows() const { return m_; }
|
||||
Index cols() const { return n_; }
|
||||
cudaStream_t stream() const { return stream_; }
|
||||
|
||||
private:
|
||||
cudaStream_t stream_ = nullptr;
|
||||
cusolverDnHandle_t handle_ = nullptr;
|
||||
cublasHandle_t cublas_ = nullptr;
|
||||
internal::CusolverParams params_;
|
||||
internal::DeviceBuffer d_qr_; // QR factors (reflectors in lower, R in upper)
|
||||
internal::DeviceBuffer d_tau_; // Householder scalars (min(m,n))
|
||||
internal::DeviceBuffer d_scratch_; // workspace + info word
|
||||
size_t scratch_size_ = 0;
|
||||
std::vector<char> h_workspace_;
|
||||
Index m_ = 0;
|
||||
Index n_ = 0;
|
||||
int64_t lda_ = 0;
|
||||
ComputationInfo info_ = InvalidInput;
|
||||
int info_word_ = 0;
|
||||
bool info_synced_ = true;
|
||||
|
||||
void init_context() {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
|
||||
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
|
||||
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
|
||||
ensure_scratch(0);
|
||||
}
|
||||
|
||||
void ensure_scratch(size_t workspace_bytes) {
|
||||
constexpr size_t kAlign = 16;
|
||||
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
|
||||
size_t needed = workspace_bytes + sizeof(int);
|
||||
if (needed > scratch_size_) {
|
||||
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
d_scratch_ = internal::DeviceBuffer(needed);
|
||||
scratch_size_ = needed;
|
||||
}
|
||||
}
|
||||
|
||||
void* scratch_workspace() const { return d_scratch_.ptr; }
|
||||
int* scratch_info() const {
|
||||
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
|
||||
}
|
||||
|
||||
void sync_info() const {
|
||||
if (!info_synced_) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
const_cast<GpuQR*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
|
||||
const_cast<GpuQR*>(this)->info_synced_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
void factorize() {
|
||||
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||
|
||||
info_synced_ = false;
|
||||
info_ = InvalidInput;
|
||||
|
||||
size_t dev_ws = 0, host_ws = 0;
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf_bufferSize(handle_, params_.p, static_cast<int64_t>(m_),
|
||||
static_cast<int64_t>(n_), dtype, d_qr_.ptr, lda_, dtype,
|
||||
d_tau_.ptr, dtype, &dev_ws, &host_ws));
|
||||
|
||||
ensure_scratch(dev_ws);
|
||||
h_workspace_.resize(host_ws);
|
||||
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf(handle_, params_.p, static_cast<int64_t>(m_), static_cast<int64_t>(n_), dtype,
|
||||
d_qr_.ptr, lda_, dtype, d_tau_.ptr, dtype, scratch_workspace(), dev_ws,
|
||||
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
|
||||
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||
}
|
||||
|
||||
// Apply Q^H to a device buffer in-place: d_B = Q^H * d_B.
|
||||
// Uses type-specific ormqr (real) or unmqr (complex) wrappers from CuSolverSupport.h.
|
||||
// For real types: Q^H = Q^T, use CUBLAS_OP_T. For complex: use CUBLAS_OP_C.
|
||||
void apply_QH(void* d_B, int64_t ldb, int64_t nrhs) const {
|
||||
const int im = static_cast<int>(m_);
|
||||
const int in = static_cast<int>(nrhs);
|
||||
const int ik = static_cast<int>((std::min)(m_, n_));
|
||||
const int ilda = static_cast<int>(lda_);
|
||||
const int ildb = static_cast<int>(ldb);
|
||||
constexpr cublasOperation_t trans = NumTraits<Scalar>::IsComplex ? CUBLAS_OP_C : CUBLAS_OP_T;
|
||||
|
||||
int lwork = 0;
|
||||
EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr_bufferSize(
|
||||
handle_, CUBLAS_SIDE_LEFT, trans, im, in, ik, static_cast<const Scalar*>(d_qr_.ptr), ilda,
|
||||
static_cast<const Scalar*>(d_tau_.ptr), static_cast<const Scalar*>(d_B), ildb, &lwork));
|
||||
|
||||
internal::DeviceBuffer d_work(static_cast<size_t>(lwork) * sizeof(Scalar));
|
||||
|
||||
EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr(handle_, CUBLAS_SIDE_LEFT, trans, im, in, ik,
|
||||
static_cast<const Scalar*>(d_qr_.ptr), ilda,
|
||||
static_cast<const Scalar*>(d_tau_.ptr), static_cast<Scalar*>(d_B),
|
||||
ildb, static_cast<Scalar*>(d_work.ptr), lwork, scratch_info()));
|
||||
|
||||
// Sync to ensure workspace can be freed safely, and check ormqr info.
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
int ormqr_info = 0;
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(&ormqr_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost));
|
||||
eigen_assert(ormqr_info == 0 && "cusolverDnXormqr reported an error");
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_QR_H
|
||||
486
Eigen/src/GPU/GpuSVD.h
Normal file
486
Eigen/src/GPU/GpuSVD.h
Normal file
@@ -0,0 +1,486 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// GPU SVD decomposition using cuSOLVER (divide-and-conquer).
|
||||
//
|
||||
// Wraps cusolverDnXgesvd. Stores U, S, VT on device. Solve uses
|
||||
// cuBLAS GEMM: X = VT^H * diag(D) * U^H * B.
|
||||
//
|
||||
// cuSOLVER returns VT (not V). We store and expose VT directly.
|
||||
//
|
||||
// Usage:
|
||||
// GpuSVD<double> svd(A, ComputeThinU | ComputeThinV);
|
||||
// VectorXd S = svd.singularValues();
|
||||
// MatrixXd U = svd.matrixU(); // m×k or m×m
|
||||
// MatrixXd VT = svd.matrixVT(); // k×n or n×n (this is V^T)
|
||||
// MatrixXd X = svd.solve(B); // pseudoinverse
|
||||
// MatrixXd X = svd.solve(B, k); // truncated (top k triplets)
|
||||
// MatrixXd X = svd.solve(B, 0.1); // Tikhonov regularized
|
||||
|
||||
#ifndef EIGEN_GPU_SVD_H
|
||||
#define EIGEN_GPU_SVD_H
|
||||
|
||||
// IWYU pragma: private
|
||||
#include "./InternalHeaderCheck.h"
|
||||
|
||||
#include "./CuSolverSupport.h"
|
||||
#include "./CuBlasSupport.h"
|
||||
#include <vector>
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
template <typename Scalar_>
|
||||
class GpuSVD {
|
||||
public:
|
||||
using Scalar = Scalar_;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||
using RealVector = Matrix<RealScalar, Dynamic, 1>;
|
||||
|
||||
GpuSVD() { init_context(); }
|
||||
|
||||
template <typename InputType>
|
||||
explicit GpuSVD(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
|
||||
init_context();
|
||||
compute(A, options);
|
||||
}
|
||||
|
||||
~GpuSVD() {
|
||||
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||
if (cublas_) (void)cublasDestroy(cublas_);
|
||||
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||
}
|
||||
|
||||
GpuSVD(const GpuSVD&) = delete;
|
||||
GpuSVD& operator=(const GpuSVD&) = delete;
|
||||
// Move constructors omitted for brevity — follow GpuQR pattern.
|
||||
|
||||
// ---- Factorization -------------------------------------------------------
|
||||
|
||||
template <typename InputType>
|
||||
GpuSVD& compute(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
|
||||
options_ = options;
|
||||
m_ = A.rows();
|
||||
n_ = A.cols();
|
||||
info_ = InvalidInput;
|
||||
info_synced_ = false;
|
||||
|
||||
if (m_ == 0 || n_ == 0) {
|
||||
info_ = Success;
|
||||
info_synced_ = true;
|
||||
return *this;
|
||||
}
|
||||
|
||||
// cuSOLVER gesvd requires m >= n. For wide matrices, transpose internally.
|
||||
transposed_ = (m_ < n_);
|
||||
const PlainMatrix mat = transposed_ ? PlainMatrix(A.derived().adjoint()) : PlainMatrix(A.derived());
|
||||
if (transposed_) std::swap(m_, n_);
|
||||
|
||||
lda_ = static_cast<int64_t>(mat.rows());
|
||||
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||
|
||||
// Copy (possibly transposed) A to device (gesvd overwrites it).
|
||||
d_A_ = internal::DeviceBuffer(mat_bytes);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
|
||||
|
||||
factorize();
|
||||
return *this;
|
||||
}
|
||||
|
||||
GpuSVD& compute(const DeviceMatrix<Scalar>& d_A, unsigned int options = ComputeThinU | ComputeThinV) {
|
||||
options_ = options;
|
||||
m_ = d_A.rows();
|
||||
n_ = d_A.cols();
|
||||
info_ = InvalidInput;
|
||||
info_synced_ = false;
|
||||
|
||||
if (m_ == 0 || n_ == 0) {
|
||||
info_ = Success;
|
||||
info_synced_ = true;
|
||||
return *this;
|
||||
}
|
||||
|
||||
transposed_ = (m_ < n_);
|
||||
d_A.waitReady(stream_);
|
||||
|
||||
if (transposed_) {
|
||||
// Transpose on device via cuBLAS geam: d_A_ = A^H.
|
||||
std::swap(m_, n_);
|
||||
lda_ = m_;
|
||||
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||
d_A_ = internal::DeviceBuffer(mat_bytes);
|
||||
Scalar alpha_one(1), beta_zero(0);
|
||||
// geam: C(m×n) = alpha * op(A) + beta * op(B). Use B = nullptr trick: beta=0.
|
||||
// A is the original d_A (n_orig × m_orig = n × m after swap), transposed → m × n.
|
||||
EIGEN_CUBLAS_CHECK(internal::cublasXgeam(
|
||||
cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(m_), static_cast<int>(n_), &alpha_one, d_A.data(),
|
||||
static_cast<int>(d_A.rows()), &beta_zero, static_cast<const Scalar*>(nullptr), static_cast<int>(m_),
|
||||
static_cast<Scalar*>(d_A_.ptr), static_cast<int>(m_)));
|
||||
} else {
|
||||
lda_ = static_cast<int64_t>(d_A.rows());
|
||||
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||
d_A_ = internal::DeviceBuffer(mat_bytes);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
|
||||
}
|
||||
|
||||
factorize();
|
||||
return *this;
|
||||
}
|
||||
|
||||
// ---- Accessors -----------------------------------------------------------
|
||||
|
||||
ComputationInfo info() const {
|
||||
sync_info();
|
||||
return info_;
|
||||
}
|
||||
|
||||
Index rows() const { return transposed_ ? n_ : m_; }
|
||||
Index cols() const { return transposed_ ? m_ : n_; }
|
||||
|
||||
/** Singular values (always available). Downloads from device on each call. */
|
||||
RealVector singularValues() const {
|
||||
sync_info();
|
||||
eigen_assert(info_ == Success);
|
||||
const Index k = (std::min)(m_, n_);
|
||||
RealVector S(k);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpy(S.data(), d_S_.ptr, static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
|
||||
return S;
|
||||
}
|
||||
|
||||
/** Left singular vectors U. Returns m_orig × k or m_orig × m_orig.
|
||||
* For transposed case (m_orig < n_orig), U comes from cuSOLVER's VT. */
|
||||
PlainMatrix matrixU() const {
|
||||
sync_info();
|
||||
eigen_assert(info_ == Success);
|
||||
eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "matrixU() requires ComputeThinU or ComputeFullU");
|
||||
const Index m_orig = transposed_ ? n_ : m_;
|
||||
const Index n_orig = transposed_ ? m_ : n_;
|
||||
const Index k = (std::min)(m_orig, n_orig);
|
||||
if (!transposed_) {
|
||||
const Index ucols = (options_ & ComputeFullU) ? m_ : k;
|
||||
PlainMatrix U(m_, ucols);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U.data(), d_U_.ptr,
|
||||
static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
|
||||
cudaMemcpyDeviceToHost));
|
||||
return U;
|
||||
} else {
|
||||
// Transposed: U_orig = VT_stored^H. VT_stored is vtrows × n_ (= vtrows × m_orig).
|
||||
const Index vtrows = (options_ & ComputeFullU) ? m_orig : k; // Note: FullU maps to FullV of A^H
|
||||
PlainMatrix VT_stored(vtrows, n_);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT_stored.data(), d_VT_.ptr,
|
||||
static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
|
||||
cudaMemcpyDeviceToHost));
|
||||
return VT_stored.adjoint(); // m_orig × vtrows
|
||||
}
|
||||
}
|
||||
|
||||
/** Right singular vectors transposed V^T. Returns k × n_orig or n_orig × n_orig.
|
||||
* For transposed case, VT comes from cuSOLVER's U. */
|
||||
PlainMatrix matrixVT() const {
|
||||
sync_info();
|
||||
eigen_assert(info_ == Success);
|
||||
eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "matrixVT() requires ComputeThinV or ComputeFullV");
|
||||
const Index m_orig = transposed_ ? n_ : m_;
|
||||
const Index n_orig = transposed_ ? m_ : n_;
|
||||
const Index k = (std::min)(m_orig, n_orig);
|
||||
if (!transposed_) {
|
||||
const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
|
||||
PlainMatrix VT(vtrows, n_);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT.data(), d_VT_.ptr,
|
||||
static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
|
||||
cudaMemcpyDeviceToHost));
|
||||
return VT;
|
||||
} else {
|
||||
// Transposed: VT_orig = U_stored^H. U_stored is m_ × ucols (= n_orig × ucols).
|
||||
const Index ucols = (options_ & ComputeFullV) ? n_orig : k; // FullV maps to FullU of A^H
|
||||
PlainMatrix U_stored(m_, ucols);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U_stored.data(), d_U_.ptr,
|
||||
static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
|
||||
cudaMemcpyDeviceToHost));
|
||||
return U_stored.adjoint(); // ucols × n_orig
|
||||
}
|
||||
}
|
||||
|
||||
/** Number of singular values above threshold. */
|
||||
Index rank(RealScalar threshold = RealScalar(-1)) const {
|
||||
RealVector S = singularValues();
|
||||
if (S.size() == 0) return 0;
|
||||
if (threshold < 0) {
|
||||
threshold = (std::max)(m_, n_) * S(0) * NumTraits<RealScalar>::epsilon();
|
||||
}
|
||||
Index r = 0;
|
||||
for (Index i = 0; i < S.size(); ++i) {
|
||||
if (S(i) > threshold) ++r;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
// ---- Solve ---------------------------------------------------------------
|
||||
|
||||
/** Pseudoinverse solve: X = V * diag(1/S) * U^H * B. */
|
||||
template <typename Rhs>
|
||||
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
|
||||
return solve_impl(B, (std::min)(m_, n_), RealScalar(0));
|
||||
}
|
||||
|
||||
/** Truncated solve: use only top trunc singular triplets. */
|
||||
template <typename Rhs>
|
||||
PlainMatrix solve(const MatrixBase<Rhs>& B, Index trunc) const {
|
||||
eigen_assert(trunc > 0 && trunc <= (std::min)(m_, n_));
|
||||
return solve_impl(B, trunc, RealScalar(0));
|
||||
}
|
||||
|
||||
/** Tikhonov-regularized solve: D_ii = S_i / (S_i^2 + lambda^2). */
|
||||
template <typename Rhs>
|
||||
PlainMatrix solve(const MatrixBase<Rhs>& B, RealScalar lambda) const {
|
||||
eigen_assert(lambda > 0);
|
||||
return solve_impl(B, (std::min)(m_, n_), lambda);
|
||||
}
|
||||
|
||||
/** CUDA stream on which this object's asynchronous work is enqueued. */
cudaStream_t stream() const { return stream_; }
|
||||
|
||||
private:
|
||||
cudaStream_t stream_ = nullptr;
|
||||
cusolverDnHandle_t handle_ = nullptr;
|
||||
cublasHandle_t cublas_ = nullptr;
|
||||
internal::CusolverParams params_;
|
||||
internal::DeviceBuffer d_A_; // working copy of A (overwritten by gesvd)
|
||||
internal::DeviceBuffer d_U_; // left singular vectors
|
||||
internal::DeviceBuffer d_S_; // singular values (RealScalar)
|
||||
internal::DeviceBuffer d_VT_; // right singular vectors transposed
|
||||
internal::DeviceBuffer d_scratch_; // workspace + info
|
||||
size_t scratch_size_ = 0;
|
||||
std::vector<char> h_workspace_;
|
||||
unsigned int options_ = 0;
|
||||
Index m_ = 0;
|
||||
Index n_ = 0;
|
||||
int64_t lda_ = 0;
|
||||
bool transposed_ = false; // true if m < n (we compute SVD of A^T internally)
|
||||
ComputationInfo info_ = InvalidInput;
|
||||
int info_word_ = 0;
|
||||
bool info_synced_ = true;
|
||||
|
||||
// Create the stream, bind cuSOLVER and cuBLAS handles to it, and pre-allocate
// the scratch area that holds the device info word.
void init_context() {
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
  EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
  EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
  EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
  EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
  ensure_scratch(0);  // reserves room for the trailing int info word
}
|
||||
|
||||
void ensure_scratch(size_t workspace_bytes) {
|
||||
constexpr size_t kAlign = 16;
|
||||
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
|
||||
size_t needed = workspace_bytes + sizeof(int);
|
||||
if (needed > scratch_size_) {
|
||||
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
d_scratch_ = internal::DeviceBuffer(needed);
|
||||
scratch_size_ = needed;
|
||||
}
|
||||
}
|
||||
|
||||
void* scratch_workspace() const { return d_scratch_.ptr; }
|
||||
int* scratch_info() const {
|
||||
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
|
||||
}
|
||||
|
||||
void sync_info() const {
|
||||
if (!info_synced_) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
const_cast<GpuSVD*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
|
||||
const_cast<GpuSVD*>(this)->info_synced_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Swap U↔V flags for the transposed case.
|
||||
static unsigned int swap_uv_options(unsigned int opts) {
|
||||
unsigned int result = 0;
|
||||
if (opts & ComputeThinU) result |= ComputeThinV;
|
||||
if (opts & ComputeFullU) result |= ComputeFullV;
|
||||
if (opts & ComputeThinV) result |= ComputeThinU;
|
||||
if (opts & ComputeFullV) result |= ComputeFullU;
|
||||
return result;
|
||||
}
|
||||
|
||||
static signed char jobu(unsigned int opts) {
|
||||
if (opts & ComputeFullU) return 'A';
|
||||
if (opts & ComputeThinU) return 'S';
|
||||
return 'N';
|
||||
}
|
||||
|
||||
static signed char jobvt(unsigned int opts) {
|
||||
if (opts & ComputeFullV) return 'A';
|
||||
if (opts & ComputeThinV) return 'S';
|
||||
return 'N';
|
||||
}
|
||||
|
||||
void factorize() {
|
||||
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||
constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
|
||||
const Index k = (std::min)(m_, n_);
|
||||
|
||||
info_synced_ = false;
|
||||
info_ = InvalidInput;
|
||||
|
||||
// Allocate output buffers. When transposed, swap U/V roles for cuSOLVER.
|
||||
d_S_ = internal::DeviceBuffer(static_cast<size_t>(k) * sizeof(RealScalar));
|
||||
|
||||
// Internal options: for transposed case, what user wants as U we compute as VT of A^H.
|
||||
const unsigned int int_opts = transposed_ ? swap_uv_options(options_) : options_;
|
||||
|
||||
const Index ucols = (int_opts & ComputeFullU) ? m_ : ((int_opts & ComputeThinU) ? k : 0);
|
||||
const Index vtrows = (int_opts & ComputeFullV) ? n_ : ((int_opts & ComputeThinV) ? k : 0);
|
||||
const int64_t ldu = m_;
|
||||
const int64_t ldvt = vtrows > 0 ? vtrows : 1;
|
||||
|
||||
if (ucols > 0) d_U_ = internal::DeviceBuffer(static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar));
|
||||
if (vtrows > 0)
|
||||
d_VT_ = internal::DeviceBuffer(static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar));
|
||||
|
||||
// computeType must match the matrix data type (dtype), not the singular value type (rtype).
|
||||
eigen_assert(m_ >= n_ && "Internal error: m_ < n_ should have been handled by transpose in compute()");
|
||||
size_t dev_ws = 0, host_ws = 0;
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd_bufferSize(
|
||||
handle_, params_.p, jobu(int_opts), jobvt(int_opts), static_cast<int64_t>(m_), static_cast<int64_t>(n_), dtype,
|
||||
d_A_.ptr, lda_, rtype, d_S_.ptr, dtype, ucols > 0 ? d_U_.ptr : nullptr, ldu, dtype,
|
||||
vtrows > 0 ? d_VT_.ptr : nullptr, ldvt, dtype, &dev_ws, &host_ws));
|
||||
|
||||
ensure_scratch(dev_ws);
|
||||
h_workspace_.resize(host_ws);
|
||||
|
||||
// Compute SVD.
|
||||
EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd(handle_, params_.p, jobu(int_opts), jobvt(int_opts), static_cast<int64_t>(m_),
|
||||
static_cast<int64_t>(n_), dtype, d_A_.ptr, lda_, rtype, d_S_.ptr, dtype,
|
||||
ucols > 0 ? d_U_.ptr : nullptr, ldu, dtype, vtrows > 0 ? d_VT_.ptr : nullptr,
|
||||
ldvt, dtype, scratch_workspace(), dev_ws,
|
||||
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
|
||||
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||
}
|
||||
|
||||
// Internal solve: X = V * diag(D) * U^H * B, using top `trunc` triplets.
|
||||
// D_ii = 1/S_i (if lambda==0) or S_i/(S_i^2+lambda^2).
|
||||
//
|
||||
// For non-transposed: stored U, VT. X = VT^H * D * U^H * B.
|
||||
// For transposed (SVD of A^H): stored U', VT'. X = U' * D * VT' * B.
|
||||
template <typename Rhs>
|
||||
PlainMatrix solve_impl(const MatrixBase<Rhs>& B, Index trunc, RealScalar lambda) const {
|
||||
sync_info();
|
||||
eigen_assert(info_ == Success && "GpuSVD::solve called on a failed or uninitialized decomposition");
|
||||
eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "solve requires U");
|
||||
eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "solve requires V");
|
||||
|
||||
const Index m_orig = transposed_ ? n_ : m_;
|
||||
const Index n_orig = transposed_ ? m_ : n_;
|
||||
eigen_assert(B.rows() == m_orig);
|
||||
|
||||
const Index k = (std::min)(m_, n_); // = min(m_orig, n_orig)
|
||||
const Index kk = (std::min)(trunc, k);
|
||||
const Index nrhs = B.cols();
|
||||
|
||||
// Download S to host to build the diagonal scaling.
|
||||
RealVector S(k);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(
|
||||
cudaMemcpy(S.data(), d_S_.ptr, static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
|
||||
// Upload B (m_orig × nrhs).
|
||||
const PlainMatrix rhs(B);
|
||||
internal::DeviceBuffer d_B(static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.ptr, rhs.data(),
|
||||
static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||
cudaMemcpyHostToDevice, stream_));
|
||||
|
||||
// Step 1: tmp = U_orig^H * B (kk × nrhs).
|
||||
// Non-transposed: U_stored is m_×ucols, U_orig = U_stored. Use U_stored^H * B.
|
||||
// Transposed: U_orig = VT_stored^H, so U_orig^H = VT_stored. Use VT_stored * B (no transpose!).
|
||||
internal::DeviceBuffer d_tmp(static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar));
|
||||
{
|
||||
Scalar alpha_one(1), beta_zero(0);
|
||||
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
|
||||
constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;
|
||||
|
||||
if (!transposed_) {
|
||||
// U_stored^H * B: (m_×kk)^H × (m_×nrhs) → kk×nrhs.
|
||||
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs),
|
||||
static_cast<int>(m_), &alpha_one, d_U_.ptr, dtype, static_cast<int>(m_),
|
||||
d_B.ptr, dtype, static_cast<int>(m_orig), &beta_zero, d_tmp.ptr, dtype,
|
||||
static_cast<int>(kk), compute, internal::cuda_gemm_algo()));
|
||||
} else {
|
||||
// VT_stored * B: VT_stored is vtrows×n_ = kk×m_orig (thin), NoTrans.
|
||||
// vtrows×m_orig times m_orig×nrhs → vtrows×nrhs. Use first kk rows.
|
||||
const Index vtrows_stored = (swap_uv_options(options_) & ComputeFullV) ? n_ : k;
|
||||
EIGEN_CUBLAS_CHECK(cublasGemmEx(
|
||||
cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs), static_cast<int>(m_orig),
|
||||
&alpha_one, d_VT_.ptr, dtype, static_cast<int>(vtrows_stored), d_B.ptr, dtype, static_cast<int>(m_orig),
|
||||
&beta_zero, d_tmp.ptr, dtype, static_cast<int>(kk), compute, internal::cuda_gemm_algo()));
|
||||
}
|
||||
}
|
||||
|
||||
// Step 2: Scale row i of tmp by D_ii.
|
||||
// Download tmp to host, scale, re-upload. (Simple and correct; a device kernel would be faster.)
|
||||
{
|
||||
PlainMatrix tmp(kk, nrhs);
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(tmp.data(), d_tmp.ptr,
|
||||
static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||
cudaMemcpyDeviceToHost, stream_));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
|
||||
for (Index i = 0; i < kk; ++i) {
|
||||
RealScalar si = S(i);
|
||||
RealScalar di = (lambda == RealScalar(0)) ? (si > 0 ? RealScalar(1) / si : RealScalar(0))
|
||||
: si / (si * si + lambda * lambda);
|
||||
tmp.row(i) *= Scalar(di);
|
||||
}
|
||||
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_tmp.ptr, tmp.data(),
|
||||
static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||
cudaMemcpyHostToDevice, stream_));
|
||||
}
|
||||
|
||||
// Step 3: X = V_orig * tmp (n_orig × nrhs).
|
||||
// Non-transposed: V_orig = VT_stored^H. VT_stored[:kk,:]^H * tmp → n_orig × nrhs.
|
||||
// Transposed: V_orig = U_stored[:,:kk]. U_stored * tmp → n_orig × nrhs (NoTrans).
|
||||
PlainMatrix X(n_orig, nrhs);
|
||||
{
|
||||
internal::DeviceBuffer d_X(static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
|
||||
Scalar alpha_one(1), beta_zero(0);
|
||||
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
|
||||
constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;
|
||||
|
||||
if (!transposed_) {
|
||||
const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
|
||||
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(n_orig),
|
||||
static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_VT_.ptr, dtype,
|
||||
static_cast<int>(vtrows), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero,
|
||||
d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo()));
|
||||
} else {
|
||||
// U_stored is m_×ucols. V_orig = U_stored[:,:kk]. NoTrans × tmp.
|
||||
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(n_orig),
|
||||
static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_U_.ptr, dtype,
|
||||
static_cast<int>(m_), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero,
|
||||
d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo()));
|
||||
}
|
||||
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_X.ptr,
|
||||
static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||
cudaMemcpyDeviceToHost, stream_));
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||
}
|
||||
|
||||
return X;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Eigen
|
||||
|
||||
#endif // EIGEN_GPU_SVD_H
|
||||
@@ -160,11 +160,40 @@ MatrixXd X2 = d_X2.toHost();
|
||||
GpuLU<double> lu;
|
||||
lu.compute(d_A);
|
||||
auto d_Y = lu.solve(d_B, GpuLU<double>::Transpose); // A^T Y = B
|
||||
|
||||
// QR solve (overdetermined least squares)
|
||||
GpuQR<double> qr(A); // host matrix input
|
||||
MatrixXd X = qr.solve(B); // Q^H * B via ormqr, then trsm on R
|
||||
|
||||
// SVD
|
||||
GpuSVD<double> svd(A, ComputeThinU | ComputeThinV);
|
||||
VectorXd S = svd.singularValues();
|
||||
MatrixXd U = svd.matrixU();
|
||||
MatrixXd VT = svd.matrixVT();
|
||||
MatrixXd X = svd.solve(B); // pseudoinverse solve
|
||||
|
||||
// Self-adjoint eigenvalue decomposition
|
||||
GpuSelfAdjointEigenSolver<double> es(A);
|
||||
VectorXd eigenvals = es.eigenvalues();
|
||||
MatrixXd eigenvecs = es.eigenvectors();
|
||||
```
|
||||
|
||||
The cached API keeps the factored matrix on device, avoiding redundant
|
||||
host-device transfers and re-factorizations.
|
||||
|
||||
### Precision control
|
||||
|
||||
GEMM dispatch enables tensor core algorithms by default, allowing cuBLAS to
|
||||
choose the fastest algorithm for the given precision and architecture. For
|
||||
double precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64
|
||||
results computed faster via tensor cores.
|
||||
|
||||
| Macro | Effect |
|
||||
|---|---|
|
||||
| *(default)* | Tensor core algorithms enabled. Float uses full FP32. Double may use Ozaki on sm_80+. |
|
||||
| `EIGEN_CUDA_TF32` | Opt-in: Float uses TF32 (~2x faster, 10-bit mantissa). Double unaffected. |
|
||||
| `EIGEN_NO_CUDA_TENSOR_OPS` | Opt-out: Pedantic compute types, no tensor cores. For bit-exact reproducibility. |
|
||||
|
||||
### Stream control and async execution
|
||||
|
||||
Operations are asynchronous by default. The compute-solve chain runs without
|
||||
@@ -271,6 +300,64 @@ GPU dense partial-pivoting LU via cuSOLVER. Same pattern as `GpuLLT`, plus
|
||||
`TransposeMode` parameter on `solve()` (`NoTranspose`, `Transpose`,
|
||||
`ConjugateTranspose`).
|
||||
|
||||
### `GpuQR<Scalar>` API
|
||||
|
||||
GPU dense QR decomposition via cuSOLVER (`geqrf`). Solve uses `ormqr` (apply
|
||||
Q^H) + `trsm` (back-substitute on R) -- Q is never formed explicitly.
|
||||
|
||||
| Method | Description |
|
||||
|--------|-------------|
|
||||
| `GpuQR()` | Default construct, then call `compute()` |
|
||||
| `GpuQR(A)` | Construct and factorize from host matrix |
|
||||
| `compute(A)` | Upload + factorize |
|
||||
| `compute(DeviceMatrix)` | D2D copy + factorize |
|
||||
| `solve(host_matrix)` | Solve, return host matrix (syncs) |
|
||||
| `solve(DeviceMatrix)` | Solve, return `DeviceMatrix` (async) |
|
||||
| `info()` | Lazy sync |
|
||||
| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream |
|
||||
|
||||
### `GpuSVD<Scalar>` API
|
||||
|
||||
GPU dense SVD via cuSOLVER (`gesvd`). Supports thin, full, and values-only
|
||||
modes via Eigen's `ComputeThinU | ComputeThinV`, `ComputeFullU | ComputeFullV`,
|
||||
or `0` (values only).
|
||||
|
||||
| Method | Description |
|
||||
|--------|-------------|
|
||||
| `GpuSVD()` | Default construct, then call `compute()` |
|
||||
| `GpuSVD(A, options)` | Construct and compute (options default: `ComputeThinU \| ComputeThinV`) |
|
||||
| `compute(A, options)` | Compute from host matrix |
|
||||
| `compute(DeviceMatrix, options)` | Compute from device matrix |
|
||||
| `singularValues()` | Download singular values to host |
|
||||
| `matrixU()` | Download U to host (requires `ComputeThinU` or `ComputeFullU`) |
|
||||
| `matrixVT()` | Download V^T to host (requires `ComputeThinV` or `ComputeFullV`) |
|
||||
| `solve(B)` | Pseudoinverse solve (returns host matrix) |
|
||||
| `solve(B, k)` | Truncated solve (top k singular triplets) |
|
||||
| `solve(B, lambda)` | Tikhonov regularized solve |
|
||||
| `rank(threshold)` | Count singular values above threshold |
|
||||
| `info()` | Lazy sync |
|
||||
| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream |
|
||||
|
||||
Wide matrices (m < n) are handled by internally transposing via cuBLAS `geam`.
|
||||
|
||||
### `GpuSelfAdjointEigenSolver<Scalar>` API
|
||||
|
||||
GPU symmetric/Hermitian eigenvalue decomposition via cuSOLVER (`syevd`).
|
||||
|
||||
| Method | Description |
|
||||
|--------|-------------|
|
||||
| `GpuSelfAdjointEigenSolver()` | Default construct, then call `compute()` |
|
||||
| `GpuSelfAdjointEigenSolver(A, mode)` | Construct and compute (mode default: `ComputeEigenvectors`) |
|
||||
| `compute(A, mode)` | Compute from host matrix |
|
||||
| `compute(DeviceMatrix, mode)` | Compute from device matrix |
|
||||
| `eigenvalues()` | Download eigenvalues to host (ascending order) |
|
||||
| `eigenvectors()` | Download eigenvectors to host (columns) |
|
||||
| `info()` | Lazy sync |
|
||||
| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream |
|
||||
|
||||
`ComputeMode`: `GpuSelfAdjointEigenSolver::EigenvaluesOnly` or
|
||||
`GpuSelfAdjointEigenSolver::ComputeEigenvectors`.
|
||||
|
||||
### `HostTransfer<Scalar>` API
|
||||
|
||||
Future for async device-to-host transfer.
|
||||
@@ -303,6 +390,9 @@ The caller must ensure operands don't alias the destination for GEMM and TRSM
|
||||
| `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
|
||||
| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
|
||||
| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
|
||||
| `GpuQR.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense QR decomposition |
|
||||
| `GpuSVD.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense SVD decomposition |
|
||||
| `GpuEigenSolver.h` | `CuSolverSupport.h` | Self-adjoint eigenvalue decomposition |
|
||||
|
||||
## Building and testing
|
||||
|
||||
@@ -313,6 +403,7 @@ cmake -G Ninja -B build -S . \
|
||||
-DEIGEN_TEST_CUBLAS=ON \
|
||||
-DEIGEN_TEST_CUSOLVER=ON
|
||||
|
||||
cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu \
    gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen gpu_device_matrix
|
||||
ctest --test-dir build -R "gpu_cublas|gpu_cusolver|gpu_device" --output-on-failure
|
||||
```
|
||||
|
||||
@@ -528,7 +528,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
|
||||
# compiler and linked against CUDA runtime + cuSOLVER. This avoids NVCC
|
||||
# instantiating Eigen's CPU packet operations for CUDA vector types.
|
||||
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||
foreach(_cusolver_test IN ITEMS gpu_cusolver_llt gpu_cusolver_lu)
|
||||
foreach(_cusolver_test IN ITEMS gpu_cusolver_llt gpu_cusolver_lu gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen)
|
||||
add_executable(${_cusolver_test} ${_cusolver_test}.cpp)
|
||||
target_include_directories(${_cusolver_test} PRIVATE
|
||||
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||
|
||||
@@ -16,6 +16,32 @@
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
// Unit roundoff for GPU GEMM compute precision.
|
||||
// TF32 (opt-in via EIGEN_CUDA_TF32) has eps ~ 2^{-10}.
|
||||
template <typename Scalar>
|
||||
typename NumTraits<Scalar>::Real gpu_unit_roundoff() {
|
||||
#if defined(EIGEN_CUDA_TF32) && !defined(EIGEN_NO_CUDA_TENSOR_OPS)
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
if (std::is_same<RealScalar, float>::value) return RealScalar(9.8e-4);
|
||||
#endif
|
||||
return NumTraits<Scalar>::epsilon();
|
||||
}
|
||||
|
||||
// Higham-Mary probabilistic error bound for GEMM:
|
||||
// ||C - fl(C)||_F <= lambda * sqrt(k) * u * ||A||_F * ||B||_F
|
||||
// where k is the inner dimension, u is the unit roundoff, and
|
||||
// lambda = sqrt(2 * ln(2/delta)) with delta = failure probability.
|
||||
// lambda = 5 corresponds to delta ~ 10^{-6}.
|
||||
// Reference: Higham & Mary, "Probabilistic Error Analysis for Inner Products",
|
||||
// SIAM J. Matrix Anal. Appl., 2019.
|
||||
template <typename Scalar>
|
||||
typename NumTraits<Scalar>::Real gemm_error_bound(Index k, typename NumTraits<Scalar>::Real normA,
|
||||
typename NumTraits<Scalar>::Real normB) {
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
constexpr RealScalar lambda = 5;
|
||||
return lambda * std::sqrt(static_cast<RealScalar>(k)) * gpu_unit_roundoff<Scalar>() * normA * normB;
|
||||
}
|
||||
|
||||
// ---- Basic GEMM: C = A * B -------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
@@ -36,7 +62,7 @@ void test_gemm_basic(Index m, Index n, Index k) {
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A * B;
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
@@ -59,7 +85,7 @@ void test_gemm_adjoint_lhs(Index m, Index n, Index k) {
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A.adjoint() * B;
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
@@ -82,7 +108,7 @@ void test_gemm_transpose_rhs(Index m, Index n, Index k) {
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A * B.transpose();
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
@@ -106,7 +132,7 @@ void test_gemm_scaled(Index m, Index n, Index k) {
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = alpha * A * B;
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
@@ -130,7 +156,7 @@ void test_gemm_accumulate(Index m, Index n, Index k) {
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = C_init + A * B;
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
@@ -153,7 +179,7 @@ void test_gemm_accumulate_empty(Index m, Index n, Index k) {
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A * B;
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
@@ -178,7 +204,7 @@ void test_gemm_subtract(Index m, Index n, Index k) {
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = C_init - A * B;
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
@@ -202,7 +228,7 @@ void test_gemm_subtract_empty(Index m, Index n, Index k) {
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = -(A * B);
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
@@ -226,7 +252,7 @@ void test_gemm_scaled_rhs(Index m, Index n, Index k) {
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A * (alpha * B);
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
@@ -266,7 +292,7 @@ void test_gemm_explicit_context(Index m, Index n, Index k) {
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A * B;
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
@@ -296,7 +322,7 @@ void test_gemm_cross_context_reuse(Index n) {
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A * B + D * E;
|
||||
|
||||
RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm()) + gemm_error_bound<Scalar>(n, D.norm(), E.norm());
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
@@ -326,7 +352,7 @@ void test_gemm_cross_context_resize() {
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = D * E;
|
||||
|
||||
RealScalar tol = RealScalar(16) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
RealScalar tol = gemm_error_bound<Scalar>(16, D.norm(), E.norm());
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
@@ -353,7 +379,9 @@ void test_gemm_chain(Index n) {
|
||||
Mat D = d_D.toHost();
|
||||
Mat D_ref = (A * B) * E;
|
||||
|
||||
RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * D_ref.norm();
|
||||
Mat C_ref = A * B;
|
||||
RealScalar tol =
|
||||
gemm_error_bound<Scalar>(n, A.norm(), B.norm()) * E.norm() + gemm_error_bound<Scalar>(n, C_ref.norm(), E.norm());
|
||||
VERIFY((D - D_ref).norm() < tol);
|
||||
}
|
||||
|
||||
@@ -401,7 +429,7 @@ void test_llt_solve_expr(Index n, Index nrhs) {
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||
}
|
||||
|
||||
// ---- LLT solve with explicit context ----------------------------------------
|
||||
@@ -423,7 +451,7 @@ void test_llt_solve_expr_context(Index n, Index nrhs) {
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||
}
|
||||
|
||||
// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------
|
||||
@@ -444,7 +472,7 @@ void test_lu_solve_expr(Index n, Index nrhs) {
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||
}
|
||||
|
||||
// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------
|
||||
@@ -474,7 +502,7 @@ void test_gemm_then_solve(Index n) {
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
RealScalar residual = (C * X - D).norm() / D.norm();
|
||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||
}
|
||||
|
||||
// ---- LLT solve with Upper triangle -----------------------------------------
|
||||
@@ -495,7 +523,7 @@ void test_llt_solve_upper(Index n, Index nrhs) {
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||
}
|
||||
|
||||
// ---- LU solve with explicit context -----------------------------------------
|
||||
@@ -517,7 +545,7 @@ void test_lu_solve_expr_context(Index n, Index nrhs) {
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||
}
|
||||
|
||||
// ---- Zero-nrhs solver expressions ------------------------------------------
|
||||
@@ -581,7 +609,7 @@ void test_trsm(Index n, Index nrhs) {
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||
}
|
||||
|
||||
// ---- SYMM/HEMM: selfadjointView<UpLo>() * B --------------------------------
|
||||
@@ -603,7 +631,7 @@ void test_symm(Index n, Index nrhs) {
|
||||
Mat C = d_C.toHost();
|
||||
Mat C_ref = A * B; // A is symmetric, so full multiply == symm
|
||||
|
||||
RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm());
|
||||
VERIFY((C - C_ref).norm() < tol);
|
||||
}
|
||||
|
||||
@@ -629,7 +657,7 @@ void test_syrk(Index n, Index k) {
|
||||
Mat C_lower = C.template triangularView<Lower>();
|
||||
Mat C_ref_lower = C_ref.template triangularView<Lower>();
|
||||
|
||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
||||
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), A.norm());
|
||||
VERIFY((C_lower - C_ref_lower).norm() < tol);
|
||||
}
|
||||
|
||||
|
||||
180
test/gpu_cusolver_eigen.cpp
Normal file
180
test/gpu_cusolver_eigen.cpp
Normal file
@@ -0,0 +1,180 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Tests for GpuSelfAdjointEigenSolver: GPU symmetric/Hermitian eigenvalue
|
||||
// decomposition via cuSOLVER.
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
#include "main.h"
|
||||
#include <Eigen/Eigenvalues>
|
||||
#include <Eigen/GPU>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
// ---- Reconstruction: V * diag(W) * V^H ≈ A ---------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_eigen_reconstruction(Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
// Build a symmetric/Hermitian matrix.
|
||||
Mat R = Mat::Random(n, n);
|
||||
Mat A = R + R.adjoint();
|
||||
|
||||
GpuSelfAdjointEigenSolver<Scalar> es(A);
|
||||
VERIFY_IS_EQUAL(es.info(), Success);
|
||||
|
||||
auto W = es.eigenvalues();
|
||||
Mat V = es.eigenvectors();
|
||||
|
||||
VERIFY_IS_EQUAL(W.size(), n);
|
||||
VERIFY_IS_EQUAL(V.rows(), n);
|
||||
VERIFY_IS_EQUAL(V.cols(), n);
|
||||
|
||||
// Reconstruct: A_hat = V * diag(W) * V^H.
|
||||
Mat A_hat = V * W.asDiagonal() * V.adjoint();
|
||||
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
|
||||
VERIFY((A_hat - A).norm() < tol);
|
||||
|
||||
// Orthogonality: V^H * V ≈ I.
|
||||
Mat VhV = V.adjoint() * V;
|
||||
Mat eye = Mat::Identity(n, n);
|
||||
VERIFY((VhV - eye).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Eigenvalues match CPU SelfAdjointEigenSolver ---------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_eigen_values(Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat R = Mat::Random(n, n);
|
||||
Mat A = R + R.adjoint();
|
||||
|
||||
GpuSelfAdjointEigenSolver<Scalar> gpu_es(A);
|
||||
VERIFY_IS_EQUAL(gpu_es.info(), Success);
|
||||
auto W_gpu = gpu_es.eigenvalues();
|
||||
|
||||
SelfAdjointEigenSolver<Mat> cpu_es(A);
|
||||
auto W_cpu = cpu_es.eigenvalues();
|
||||
|
||||
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() *
|
||||
W_cpu.cwiseAbs().maxCoeff();
|
||||
VERIFY((W_gpu - W_cpu).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Eigenvalues-only mode --------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_eigen_values_only(Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat R = Mat::Random(n, n);
|
||||
Mat A = R + R.adjoint();
|
||||
|
||||
GpuSelfAdjointEigenSolver<Scalar> gpu_es(A, GpuSelfAdjointEigenSolver<Scalar>::EigenvaluesOnly);
|
||||
VERIFY_IS_EQUAL(gpu_es.info(), Success);
|
||||
auto W_gpu = gpu_es.eigenvalues();
|
||||
|
||||
SelfAdjointEigenSolver<Mat> cpu_es(A, EigenvaluesOnly);
|
||||
auto W_cpu = cpu_es.eigenvalues();
|
||||
|
||||
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() *
|
||||
W_cpu.cwiseAbs().maxCoeff();
|
||||
VERIFY((W_gpu - W_cpu).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- DeviceMatrix input path ------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_eigen_device_matrix(Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat R = Mat::Random(n, n);
|
||||
Mat A = R + R.adjoint();
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
GpuSelfAdjointEigenSolver<Scalar> es;
|
||||
es.compute(d_A);
|
||||
VERIFY_IS_EQUAL(es.info(), Success);
|
||||
|
||||
auto W_gpu = es.eigenvalues();
|
||||
Mat V = es.eigenvectors();
|
||||
|
||||
// Verify reconstruction.
|
||||
Mat A_hat = V * W_gpu.asDiagonal() * V.adjoint();
|
||||
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
|
||||
VERIFY((A_hat - A).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Recompute (reuse solver object) ----------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_eigen_recompute(Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
GpuSelfAdjointEigenSolver<Scalar> es;
|
||||
|
||||
for (int trial = 0; trial < 3; ++trial) {
|
||||
Mat R = Mat::Random(n, n);
|
||||
Mat A = R + R.adjoint();
|
||||
es.compute(A);
|
||||
VERIFY_IS_EQUAL(es.info(), Success);
|
||||
|
||||
auto W = es.eigenvalues();
|
||||
Mat V = es.eigenvectors();
|
||||
Mat A_hat = V * W.asDiagonal() * V.adjoint();
|
||||
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
|
||||
VERIFY((A_hat - A).norm() < tol);
|
||||
}
|
||||
}
|
||||
|
||||
// ---- Empty matrix -----------------------------------------------------------
|
||||
|
||||
void test_eigen_empty() {
|
||||
GpuSelfAdjointEigenSolver<double> es(MatrixXd(0, 0));
|
||||
VERIFY_IS_EQUAL(es.info(), Success);
|
||||
VERIFY_IS_EQUAL(es.rows(), 0);
|
||||
VERIFY_IS_EQUAL(es.cols(), 0);
|
||||
}
|
||||
|
||||
// ---- Per-scalar driver ------------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_scalar() {
|
||||
// Reconstruction + orthogonality.
|
||||
CALL_SUBTEST(test_eigen_reconstruction<Scalar>(64));
|
||||
CALL_SUBTEST(test_eigen_reconstruction<Scalar>(128));
|
||||
|
||||
// Eigenvalues match CPU.
|
||||
CALL_SUBTEST(test_eigen_values<Scalar>(64));
|
||||
CALL_SUBTEST(test_eigen_values<Scalar>(128));
|
||||
|
||||
// Values-only mode.
|
||||
CALL_SUBTEST(test_eigen_values_only<Scalar>(64));
|
||||
|
||||
// DeviceMatrix input.
|
||||
CALL_SUBTEST(test_eigen_device_matrix<Scalar>(64));
|
||||
|
||||
// Recompute.
|
||||
CALL_SUBTEST(test_eigen_recompute<Scalar>(32));
|
||||
}
|
||||
|
||||
EIGEN_DECLARE_TEST(gpu_cusolver_eigen) {
  // All four cuSOLVER-supported scalar types, plus the degenerate 0x0 case.
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
  CALL_SUBTEST(test_eigen_empty());
}
|
||||
185
test/gpu_cusolver_qr.cpp
Normal file
185
test/gpu_cusolver_qr.cpp
Normal file
@@ -0,0 +1,185 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Tests for GpuQR: GPU QR decomposition via cuSOLVER.
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
#include "main.h"
|
||||
#include <Eigen/QR>
|
||||
#include <Eigen/GPU>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
// ---- Solve square system: A * X = B -----------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_qr_solve_square(Index n, Index nrhs) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(n, n);
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
|
||||
GpuQR<Scalar> qr(A);
|
||||
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||
|
||||
Mat X = qr.solve(B);
|
||||
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
}
|
||||
|
||||
// ---- Solve overdetermined system: m > n (least-squares) ---------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_qr_solve_overdetermined(Index m, Index n, Index nrhs) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
eigen_assert(m >= n);
|
||||
Mat A = Mat::Random(m, n);
|
||||
Mat B = Mat::Random(m, nrhs);
|
||||
|
||||
GpuQR<Scalar> qr(A);
|
||||
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||
|
||||
Mat X = qr.solve(B);
|
||||
VERIFY_IS_EQUAL(X.rows(), n);
|
||||
VERIFY_IS_EQUAL(X.cols(), nrhs);
|
||||
|
||||
// Compare with CPU QR.
|
||||
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
|
||||
RealScalar tol = RealScalar(100) * RealScalar(m) * NumTraits<Scalar>::epsilon();
|
||||
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Solve with DeviceMatrix input ------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_qr_solve_device(Index n, Index nrhs) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(n, n);
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
GpuQR<Scalar> qr;
|
||||
qr.compute(d_A);
|
||||
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||
|
||||
DeviceMatrix<Scalar> d_X = qr.solve(d_B);
|
||||
Mat X = d_X.toHost();
|
||||
|
||||
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||
}
|
||||
|
||||
// ---- Solve overdetermined via device path -----------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_qr_solve_overdetermined_device(Index m, Index n, Index nrhs) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
eigen_assert(m >= n);
|
||||
Mat A = Mat::Random(m, n);
|
||||
Mat B = Mat::Random(m, nrhs);
|
||||
|
||||
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||
|
||||
GpuQR<Scalar> qr;
|
||||
qr.compute(d_A);
|
||||
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||
|
||||
DeviceMatrix<Scalar> d_X = qr.solve(d_B);
|
||||
VERIFY_IS_EQUAL(d_X.rows(), n);
|
||||
VERIFY_IS_EQUAL(d_X.cols(), nrhs);
|
||||
|
||||
Mat X = d_X.toHost();
|
||||
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
|
||||
RealScalar tol = RealScalar(100) * RealScalar(m) * NumTraits<Scalar>::epsilon();
|
||||
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Multiple solves reuse the factorization --------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_qr_multiple_solves(Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(n, n);
|
||||
GpuQR<Scalar> qr(A);
|
||||
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||
|
||||
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||
for (int k = 0; k < 5; ++k) {
|
||||
Mat B = Mat::Random(n, 3);
|
||||
Mat X = qr.solve(B);
|
||||
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||
VERIFY(residual < tol);
|
||||
}
|
||||
}
|
||||
|
||||
// ---- Agreement with CPU HouseholderQR ---------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_qr_vs_cpu(Index n, Index nrhs) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(n, n);
|
||||
Mat B = Mat::Random(n, nrhs);
|
||||
|
||||
GpuQR<Scalar> gpu_qr(A);
|
||||
VERIFY_IS_EQUAL(gpu_qr.info(), Success);
|
||||
|
||||
Mat X_gpu = gpu_qr.solve(B);
|
||||
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
|
||||
|
||||
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||
VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Per-scalar driver ------------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_scalar() {
|
||||
CALL_SUBTEST(test_qr_solve_square<Scalar>(1, 1));
|
||||
CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 1));
|
||||
CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 4));
|
||||
CALL_SUBTEST(test_qr_solve_square<Scalar>(256, 8));
|
||||
|
||||
CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(128, 64, 4));
|
||||
CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(256, 128, 1));
|
||||
|
||||
CALL_SUBTEST(test_qr_solve_device<Scalar>(64, 4));
|
||||
CALL_SUBTEST(test_qr_solve_overdetermined_device<Scalar>(128, 64, 4));
|
||||
CALL_SUBTEST(test_qr_multiple_solves<Scalar>(64));
|
||||
CALL_SUBTEST(test_qr_vs_cpu<Scalar>(64, 4));
|
||||
CALL_SUBTEST(test_qr_vs_cpu<Scalar>(256, 8));
|
||||
}
|
||||
|
||||
// A 0x0 input must succeed and report zero dimensions.
void test_qr_empty() {
  GpuQR<double> qr(MatrixXd(0, 0));
  VERIFY_IS_EQUAL(qr.info(), Success);
  VERIFY_IS_EQUAL(qr.rows(), 0);
  VERIFY_IS_EQUAL(qr.cols(), 0);
}
|
||||
|
||||
EIGEN_DECLARE_TEST(gpu_cusolver_qr) {
  // All four cuSOLVER-supported scalar types, plus the degenerate 0x0 case.
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
  CALL_SUBTEST(test_qr_empty());
}
|
||||
194
test/gpu_cusolver_svd.cpp
Normal file
194
test/gpu_cusolver_svd.cpp
Normal file
@@ -0,0 +1,194 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
// Tests for GpuSVD: GPU SVD via cuSOLVER.
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
#include "main.h"
|
||||
#include <Eigen/SVD>
|
||||
#include <Eigen/GPU>
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
// ---- SVD reconstruction: U * diag(S) * VT ≈ A ------------------------------
|
||||
|
||||
template <typename Scalar, unsigned int Options>
|
||||
void test_svd_reconstruction(Index m, Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(m, n);
|
||||
GpuSVD<Scalar> svd(A, Options);
|
||||
VERIFY_IS_EQUAL(svd.info(), Success);
|
||||
|
||||
auto S = svd.singularValues();
|
||||
Mat U = svd.matrixU();
|
||||
Mat VT = svd.matrixVT();
|
||||
|
||||
const Index k = (std::min)(m, n);
|
||||
|
||||
// Reconstruct: A_hat = U[:,:k] * diag(S) * VT[:k,:].
|
||||
Mat A_hat = U.leftCols(k) * S.asDiagonal() * VT.topRows(k);
|
||||
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(k)) * NumTraits<Scalar>::epsilon() * A.norm();
|
||||
VERIFY((A_hat - A).norm() < tol);
|
||||
|
||||
// Orthogonality: U^H * U ≈ I.
|
||||
Mat UtU = U.adjoint() * U;
|
||||
Mat I_u = Mat::Identity(U.cols(), U.cols());
|
||||
VERIFY((UtU - I_u).norm() < tol);
|
||||
|
||||
// Orthogonality: VT * VT^H ≈ I.
|
||||
Mat VtVh = VT * VT.adjoint();
|
||||
Mat I_v = Mat::Identity(VT.rows(), VT.rows());
|
||||
VERIFY((VtVh - I_v).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Singular values match CPU BDCSVD ---------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_svd_singular_values(Index m, Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(m, n);
|
||||
GpuSVD<Scalar> svd(A, 0); // values only
|
||||
VERIFY_IS_EQUAL(svd.info(), Success);
|
||||
|
||||
auto S_gpu = svd.singularValues();
|
||||
auto S_cpu = BDCSVD<Mat>(A, 0).singularValues();
|
||||
|
||||
RealScalar tol =
|
||||
RealScalar(5) * std::sqrt(static_cast<RealScalar>((std::min)(m, n))) * NumTraits<Scalar>::epsilon() * S_cpu(0);
|
||||
VERIFY((S_gpu - S_cpu).norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Solve: pseudoinverse ---------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_svd_solve(Index m, Index n, Index nrhs) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(m, n);
|
||||
Mat B = Mat::Random(m, nrhs);
|
||||
|
||||
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
|
||||
VERIFY_IS_EQUAL(svd.info(), Success);
|
||||
|
||||
Mat X = svd.solve(B);
|
||||
VERIFY_IS_EQUAL(X.rows(), n);
|
||||
VERIFY_IS_EQUAL(X.cols(), nrhs);
|
||||
|
||||
// Compare with CPU BDCSVD solve.
|
||||
Mat X_cpu = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV).solve(B);
|
||||
RealScalar tol = RealScalar(100) * RealScalar((std::max)(m, n)) * NumTraits<Scalar>::epsilon();
|
||||
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Solve: truncated -------------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_svd_solve_truncated(Index m, Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(m, n);
|
||||
Mat B = Mat::Random(m, 1);
|
||||
const Index k = (std::min)(m, n);
|
||||
const Index trunc = k / 2;
|
||||
eigen_assert(trunc > 0);
|
||||
|
||||
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
|
||||
Mat X_trunc = svd.solve(B, trunc);
|
||||
|
||||
// Build CPU reference: truncated pseudoinverse.
|
||||
auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
|
||||
auto S = cpu_svd.singularValues();
|
||||
Mat U = cpu_svd.matrixU();
|
||||
Mat V = cpu_svd.matrixV();
|
||||
|
||||
// D_ii = 1/S_i for i < trunc, 0 otherwise.
|
||||
Matrix<RealScalar, Dynamic, 1> D = Matrix<RealScalar, Dynamic, 1>::Zero(k);
|
||||
for (Index i = 0; i < trunc; ++i) D(i) = RealScalar(1) / S(i);
|
||||
Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;
|
||||
|
||||
RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
|
||||
VERIFY((X_trunc - X_ref).norm() / X_ref.norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Solve: Tikhonov regularized --------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_svd_solve_regularized(Index m, Index n) {
|
||||
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||
|
||||
Mat A = Mat::Random(m, n);
|
||||
Mat B = Mat::Random(m, 1);
|
||||
RealScalar lambda = RealScalar(0.1);
|
||||
const Index k = (std::min)(m, n);
|
||||
|
||||
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
|
||||
Mat X_reg = svd.solve(B, lambda);
|
||||
|
||||
// CPU reference: D_ii = S_i / (S_i^2 + lambda^2).
|
||||
auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
|
||||
auto S = cpu_svd.singularValues();
|
||||
Mat U = cpu_svd.matrixU();
|
||||
Mat V = cpu_svd.matrixV();
|
||||
|
||||
Matrix<RealScalar, Dynamic, 1> D(k);
|
||||
for (Index i = 0; i < k; ++i) D(i) = S(i) / (S(i) * S(i) + lambda * lambda);
|
||||
Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;
|
||||
|
||||
RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
|
||||
VERIFY((X_reg - X_ref).norm() / X_ref.norm() < tol);
|
||||
}
|
||||
|
||||
// ---- Empty matrix -----------------------------------------------------------
|
||||
|
||||
// A 0x0 input must succeed and report zero dimensions.
void test_svd_empty() {
  GpuSVD<double> svd(MatrixXd(0, 0), 0);
  VERIFY_IS_EQUAL(svd.info(), Success);
  VERIFY_IS_EQUAL(svd.rows(), 0);
  VERIFY_IS_EQUAL(svd.cols(), 0);
}
|
||||
|
||||
// ---- Per-scalar driver ------------------------------------------------------
|
||||
|
||||
template <typename Scalar>
|
||||
void test_scalar() {
|
||||
// Reconstruction + orthogonality (thin and full, identical test logic).
|
||||
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 64)));
|
||||
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(128, 64)));
|
||||
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 128))); // wide (m < n)
|
||||
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(64, 64)));
|
||||
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(128, 64)));
|
||||
|
||||
// Singular values.
|
||||
CALL_SUBTEST(test_svd_singular_values<Scalar>(64, 64));
|
||||
CALL_SUBTEST(test_svd_singular_values<Scalar>(128, 64));
|
||||
|
||||
// Solve.
|
||||
CALL_SUBTEST(test_svd_solve<Scalar>(64, 64, 4));
|
||||
CALL_SUBTEST(test_svd_solve<Scalar>(128, 64, 4));
|
||||
CALL_SUBTEST(test_svd_solve<Scalar>(64, 128, 4)); // wide (m < n)
|
||||
|
||||
// Truncated and regularized solve.
|
||||
CALL_SUBTEST(test_svd_solve_truncated<Scalar>(64, 64));
|
||||
CALL_SUBTEST(test_svd_solve_regularized<Scalar>(64, 64));
|
||||
}
|
||||
|
||||
EIGEN_DECLARE_TEST(gpu_cusolver_svd) {
  // All four cuSOLVER-supported scalar types, plus the degenerate 0x0 case.
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
  CALL_SUBTEST(test_svd_empty());
}
|
||||
@@ -35,7 +35,6 @@ void test_allocate(Index rows, Index cols) {
|
||||
VERIFY(!dm.empty());
|
||||
VERIFY_IS_EQUAL(dm.rows(), rows);
|
||||
VERIFY_IS_EQUAL(dm.cols(), cols);
|
||||
VERIFY_IS_EQUAL(dm.outerStride(), rows);
|
||||
VERIFY(dm.data() != nullptr);
|
||||
VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar));
|
||||
}
|
||||
@@ -69,7 +68,7 @@ void test_roundtrip_async(Index rows, Index cols) {
|
||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));
|
||||
|
||||
// Async upload from raw pointer.
|
||||
auto dm = DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, rows, stream);
|
||||
auto dm = DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, stream);
|
||||
VERIFY_IS_EQUAL(dm.rows(), rows);
|
||||
VERIFY_IS_EQUAL(dm.cols(), cols);
|
||||
|
||||
@@ -185,7 +184,6 @@ void test_resize() {
|
||||
dm.resize(50, 30);
|
||||
VERIFY_IS_EQUAL(dm.rows(), 50);
|
||||
VERIFY_IS_EQUAL(dm.cols(), 30);
|
||||
VERIFY_IS_EQUAL(dm.outerStride(), 50);
|
||||
VERIFY(dm.data() != nullptr);
|
||||
|
||||
// Resize to same dimensions is a no-op.
|
||||
|
||||
Reference in New Issue
Block a user