mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Compare commits
2 Commits
gpu-librar
...
gpu-sparse
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
43a95b62bb | ||
|
|
8593c7f5a1 |
14
Eigen/GPU
14
Eigen/GPU
@@ -47,6 +47,20 @@
|
|||||||
#include "src/GPU/DeviceDispatch.h"
|
#include "src/GPU/DeviceDispatch.h"
|
||||||
#include "src/GPU/GpuLLT.h"
|
#include "src/GPU/GpuLLT.h"
|
||||||
#include "src/GPU/GpuLU.h"
|
#include "src/GPU/GpuLU.h"
|
||||||
|
#include "src/GPU/GpuQR.h"
|
||||||
|
#include "src/GPU/GpuSVD.h"
|
||||||
|
#include "src/GPU/GpuEigenSolver.h"
|
||||||
|
#include "src/GPU/CuFftSupport.h"
|
||||||
|
#include "src/GPU/GpuFFT.h"
|
||||||
|
#include "src/GPU/CuSparseSupport.h"
|
||||||
|
#include "src/GPU/GpuSparseContext.h"
|
||||||
|
#ifdef EIGEN_CUDSS
|
||||||
|
#include "src/GPU/CuDssSupport.h"
|
||||||
|
#include "src/GPU/GpuSparseSolverBase.h"
|
||||||
|
#include "src/GPU/GpuSparseLLT.h"
|
||||||
|
#include "src/GPU/GpuSparseLDLT.h"
|
||||||
|
#include "src/GPU/GpuSparseLU.h"
|
||||||
|
#endif
|
||||||
// IWYU pragma: end_exports
|
// IWYU pragma: end_exports
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
@@ -51,25 +51,69 @@ constexpr cublasOperation_t to_cublas_op(GpuOp op) {
|
|||||||
|
|
||||||
// ---- Scalar → cublasComputeType_t -------------------------------------------
|
// ---- Scalar → cublasComputeType_t -------------------------------------------
|
||||||
// cublasGemmEx requires a compute type (separate from the data type).
|
// cublasGemmEx requires a compute type (separate from the data type).
|
||||||
|
//
|
||||||
|
// Precision policy:
|
||||||
|
// - Default: tensor core algorithms enabled (CUBLAS_GEMM_DEFAULT_TENSOR_OP).
|
||||||
|
// For double, cuBLAS may use Ozaki emulation on sm_80+ tensor cores.
|
||||||
|
// - EIGEN_CUDA_TF32: opt-in to TF32 for float (~2x faster, 10-bit mantissa).
|
||||||
|
// - EIGEN_NO_CUDA_TENSOR_OPS: disables all tensor core usage. Uses pedantic
|
||||||
|
// compute types and CUBLAS_GEMM_DEFAULT algorithm. For bit-exact reproducibility.
|
||||||
|
|
||||||
template <typename Scalar>
|
template <typename Scalar>
|
||||||
struct cuda_compute_type;
|
struct cuda_compute_type;
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct cuda_compute_type<float> {
|
struct cuda_compute_type<float> {
|
||||||
|
#if defined(EIGEN_NO_CUDA_TENSOR_OPS)
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_PEDANTIC;
|
||||||
|
#elif defined(EIGEN_CUDA_TF32)
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_FAST_TF32;
|
||||||
|
#else
|
||||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
struct cuda_compute_type<double> {
|
struct cuda_compute_type<double> {
|
||||||
|
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F_PEDANTIC;
|
||||||
|
#else
|
||||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
struct cuda_compute_type<std::complex<float>> {
|
struct cuda_compute_type<std::complex<float>> {
|
||||||
|
#if defined(EIGEN_NO_CUDA_TENSOR_OPS)
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_PEDANTIC;
|
||||||
|
#elif defined(EIGEN_CUDA_TF32)
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_FAST_TF32;
|
||||||
|
#else
|
||||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
struct cuda_compute_type<std::complex<double>> {
|
struct cuda_compute_type<std::complex<double>> {
|
||||||
|
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F_PEDANTIC;
|
||||||
|
#else
|
||||||
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
// ---- GEMM algorithm hint ----------------------------------------------------
|
||||||
|
|
||||||
|
constexpr cublasGemmAlgo_t cuda_gemm_algo() {
|
||||||
|
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
|
||||||
|
return CUBLAS_GEMM_DEFAULT;
|
||||||
|
#else
|
||||||
|
return CUBLAS_GEMM_DEFAULT_TENSOR_OP;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Alpha/beta scalar type for cublasGemmEx --------------------------------
|
||||||
|
// For standard types, alpha/beta match the scalar type.
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct cuda_gemm_scalar {
|
||||||
|
using type = Scalar;
|
||||||
};
|
};
|
||||||
|
|
||||||
// ---- Type-specific cuBLAS wrappers ------------------------------------------
|
// ---- Type-specific cuBLAS wrappers ------------------------------------------
|
||||||
@@ -154,6 +198,35 @@ inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cubla
|
|||||||
reinterpret_cast<cuDoubleComplex*>(C), ldc);
|
reinterpret_cast<cuDoubleComplex*>(C), ldc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GEAM wrappers: C = alpha * op(A) + beta * op(B)
|
||||||
|
// Covers transpose, scale, matrix add/subtract in one call.
|
||||||
|
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
|
||||||
|
const float* alpha, const float* A, int lda, const float* beta, const float* B,
|
||||||
|
int ldb, float* C, int ldc) {
|
||||||
|
return cublasSgeam(h, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
|
||||||
|
const double* alpha, const double* A, int lda, const double* beta, const double* B,
|
||||||
|
int ldb, double* C, int ldc) {
|
||||||
|
return cublasDgeam(h, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
|
||||||
|
const std::complex<float>* alpha, const std::complex<float>* A, int lda,
|
||||||
|
const std::complex<float>* beta, const std::complex<float>* B, int ldb,
|
||||||
|
std::complex<float>* C, int ldc) {
|
||||||
|
return cublasCgeam(h, transa, transb, m, n, reinterpret_cast<const cuComplex*>(alpha),
|
||||||
|
reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<const cuComplex*>(beta),
|
||||||
|
reinterpret_cast<const cuComplex*>(B), ldb, reinterpret_cast<cuComplex*>(C), ldc);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
|
||||||
|
const std::complex<double>* alpha, const std::complex<double>* A, int lda,
|
||||||
|
const std::complex<double>* beta, const std::complex<double>* B, int ldb,
|
||||||
|
std::complex<double>* C, int ldc) {
|
||||||
|
return cublasZgeam(h, transa, transb, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
|
||||||
|
reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<const cuDoubleComplex*>(beta),
|
||||||
|
reinterpret_cast<const cuDoubleComplex*>(B), ldb, reinterpret_cast<cuDoubleComplex*>(C), ldc);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace internal
|
} // namespace internal
|
||||||
} // namespace Eigen
|
} // namespace Eigen
|
||||||
|
|
||||||
|
|||||||
134
Eigen/src/GPU/CuDssSupport.h
Normal file
134
Eigen/src/GPU/CuDssSupport.h
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// cuDSS support utilities: error checking macro, type mapping.
|
||||||
|
//
|
||||||
|
// cuDSS is NVIDIA's sparse direct solver library, supporting Cholesky (LL^T),
|
||||||
|
// LDL^T, and LU factorization on GPU. It requires CUDA 12.0+ and is
|
||||||
|
// distributed separately from the CUDA Toolkit.
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_CUDSS_SUPPORT_H
|
||||||
|
#define EIGEN_GPU_CUDSS_SUPPORT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSupport.h"
|
||||||
|
#include <cudss.h>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
// ---- Error checking ---------------------------------------------------------
|
||||||
|
|
||||||
|
#define EIGEN_CUDSS_CHECK(x) \
|
||||||
|
do { \
|
||||||
|
cudssStatus_t _s = (x); \
|
||||||
|
eigen_assert(_s == CUDSS_STATUS_SUCCESS && "cuDSS call failed: " #x); \
|
||||||
|
EIGEN_UNUSED_VARIABLE(_s); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
// ---- Scalar → cudssMatrixType_t for SPD/HPD ---------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct cudss_spd_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cudss_spd_type<float> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_spd_type<double> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_spd_type<std::complex<float>> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_spd_type<std::complex<double>> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- Scalar → cudssMatrixType_t for symmetric/Hermitian ---------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct cudss_symmetric_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cudss_symmetric_type<float> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_symmetric_type<double> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_symmetric_type<std::complex<float>> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_symmetric_type<std::complex<double>> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- StorageIndex → cudaDataType_t ------------------------------------------
|
||||||
|
|
||||||
|
template <typename StorageIndex>
|
||||||
|
struct cudss_index_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cudss_index_type<int> {
|
||||||
|
static constexpr cudaDataType_t value = CUDA_R_32I;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_index_type<int64_t> {
|
||||||
|
static constexpr cudaDataType_t value = CUDA_R_64I;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- UpLo → cudssMatrixViewType_t -------------------------------------------
|
||||||
|
// For symmetric matrices stored as CSC (ColMajor), cuDSS sees CSR of A^T.
|
||||||
|
// Since A = A^T, the data is the same, but the triangle view must be swapped.
|
||||||
|
|
||||||
|
template <int UpLo, int StorageOrder>
|
||||||
|
struct cudss_view_type;
|
||||||
|
|
||||||
|
// ColMajor (CSC) passed as CSR: lower ↔ upper swap.
|
||||||
|
template <>
|
||||||
|
struct cudss_view_type<Lower, ColMajor> {
|
||||||
|
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_view_type<Upper, ColMajor> {
|
||||||
|
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
|
||||||
|
};
|
||||||
|
|
||||||
|
// RowMajor (CSR) passed directly: no swap needed.
|
||||||
|
template <>
|
||||||
|
struct cudss_view_type<Lower, RowMajor> {
|
||||||
|
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_view_type<Upper, RowMajor> {
|
||||||
|
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
|
||||||
|
// ---- Ordering enum ----------------------------------------------------------
|
||||||
|
|
||||||
|
enum class GpuSparseOrdering {
|
||||||
|
AMD, // Default fill-reducing ordering
|
||||||
|
METIS, // METIS nested dissection
|
||||||
|
RCM // Reverse Cuthill-McKee
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_CUDSS_SUPPORT_H
|
||||||
103
Eigen/src/GPU/CuFftSupport.h
Normal file
103
Eigen/src/GPU/CuFftSupport.h
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// cuFFT support utilities: error checking macro, type mapping.
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_CUFFT_SUPPORT_H
|
||||||
|
#define EIGEN_GPU_CUFFT_SUPPORT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSupport.h"
|
||||||
|
#include <cufft.h>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
// ---- Error checking ---------------------------------------------------------
|
||||||
|
|
||||||
|
#define EIGEN_CUFFT_CHECK(x) \
|
||||||
|
do { \
|
||||||
|
cufftResult _r = (x); \
|
||||||
|
eigen_assert(_r == CUFFT_SUCCESS && "cuFFT call failed: " #x); \
|
||||||
|
EIGEN_UNUSED_VARIABLE(_r); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
// ---- Scalar → cufftType traits ----------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct cufft_c2c_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cufft_c2c_type<float> {
|
||||||
|
static constexpr cufftType value = CUFFT_C2C;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cufft_c2c_type<double> {
|
||||||
|
static constexpr cufftType value = CUFFT_Z2Z;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct cufft_r2c_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cufft_r2c_type<float> {
|
||||||
|
static constexpr cufftType value = CUFFT_R2C;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cufft_r2c_type<double> {
|
||||||
|
static constexpr cufftType value = CUFFT_D2Z;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct cufft_c2r_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cufft_c2r_type<float> {
|
||||||
|
static constexpr cufftType value = CUFFT_C2R;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cufft_c2r_type<double> {
|
||||||
|
static constexpr cufftType value = CUFFT_Z2D;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- Type-dispatched cuFFT execution ----------------------------------------
|
||||||
|
|
||||||
|
// C2C
|
||||||
|
inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<float>* in, std::complex<float>* out,
|
||||||
|
int direction) {
|
||||||
|
return cufftExecC2C(plan, reinterpret_cast<cufftComplex*>(in), reinterpret_cast<cufftComplex*>(out), direction);
|
||||||
|
}
|
||||||
|
inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<double>* in, std::complex<double>* out,
|
||||||
|
int direction) {
|
||||||
|
return cufftExecZ2Z(plan, reinterpret_cast<cufftDoubleComplex*>(in), reinterpret_cast<cufftDoubleComplex*>(out),
|
||||||
|
direction);
|
||||||
|
}
|
||||||
|
|
||||||
|
// R2C
|
||||||
|
inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, float* in, std::complex<float>* out) {
|
||||||
|
return cufftExecR2C(plan, in, reinterpret_cast<cufftComplex*>(out));
|
||||||
|
}
|
||||||
|
inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, double* in, std::complex<double>* out) {
|
||||||
|
return cufftExecD2Z(plan, in, reinterpret_cast<cufftDoubleComplex*>(out));
|
||||||
|
}
|
||||||
|
|
||||||
|
// C2R
|
||||||
|
inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<float>* in, float* out) {
|
||||||
|
return cufftExecC2R(plan, reinterpret_cast<cufftComplex*>(in), out);
|
||||||
|
}
|
||||||
|
inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<double>* in, double* out) {
|
||||||
|
return cufftExecZ2D(plan, reinterpret_cast<cufftDoubleComplex*>(in), out);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_CUFFT_SUPPORT_H
|
||||||
@@ -91,6 +91,68 @@ struct cusolver_fill_mode<Upper, RowMajor> {
|
|||||||
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
|
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// ---- Type-specific cuSOLVER wrappers ----------------------------------------
|
||||||
|
// cuSOLVER does not provide generic X variants for ormqr/unmqr. These overloaded
|
||||||
|
// wrappers dispatch to the correct type-specific function.
|
||||||
|
// For real types: ormqr (orthogonal Q). For complex types: unmqr (unitary Q).
|
||||||
|
|
||||||
|
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
|
||||||
|
int n, int k, const float* A, int lda, const float* tau, float* C, int ldc,
|
||||||
|
float* work, int lwork, int* info) {
|
||||||
|
return cusolverDnSormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
|
||||||
|
}
|
||||||
|
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
|
||||||
|
int n, int k, const double* A, int lda, const double* tau, double* C, int ldc,
|
||||||
|
double* work, int lwork, int* info) {
|
||||||
|
return cusolverDnDormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
|
||||||
|
}
|
||||||
|
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
|
||||||
|
int n, int k, const std::complex<float>* A, int lda,
|
||||||
|
const std::complex<float>* tau, std::complex<float>* C, int ldc,
|
||||||
|
std::complex<float>* work, int lwork, int* info) {
|
||||||
|
return cusolverDnCunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
|
||||||
|
reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<cuComplex*>(C), ldc,
|
||||||
|
reinterpret_cast<cuComplex*>(work), lwork, info);
|
||||||
|
}
|
||||||
|
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
|
||||||
|
int n, int k, const std::complex<double>* A, int lda,
|
||||||
|
const std::complex<double>* tau, std::complex<double>* C, int ldc,
|
||||||
|
std::complex<double>* work, int lwork, int* info) {
|
||||||
|
return cusolverDnZunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
|
||||||
|
reinterpret_cast<const cuDoubleComplex*>(tau), reinterpret_cast<cuDoubleComplex*>(C), ldc,
|
||||||
|
reinterpret_cast<cuDoubleComplex*>(work), lwork, info);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Buffer size wrappers for ormqr/unmqr.
|
||||||
|
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
|
||||||
|
cublasOperation_t trans, int m, int n, int k, const float* A,
|
||||||
|
int lda, const float* tau, const float* C, int ldc, int* lwork) {
|
||||||
|
return cusolverDnSormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
|
||||||
|
}
|
||||||
|
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
|
||||||
|
cublasOperation_t trans, int m, int n, int k, const double* A,
|
||||||
|
int lda, const double* tau, const double* C, int ldc, int* lwork) {
|
||||||
|
return cusolverDnDormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
|
||||||
|
}
|
||||||
|
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
|
||||||
|
cublasOperation_t trans, int m, int n, int k,
|
||||||
|
const std::complex<float>* A, int lda,
|
||||||
|
const std::complex<float>* tau, const std::complex<float>* C,
|
||||||
|
int ldc, int* lwork) {
|
||||||
|
return cusolverDnCunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
|
||||||
|
reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<const cuComplex*>(C),
|
||||||
|
ldc, lwork);
|
||||||
|
}
|
||||||
|
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
|
||||||
|
cublasOperation_t trans, int m, int n, int k,
|
||||||
|
const std::complex<double>* A, int lda,
|
||||||
|
const std::complex<double>* tau, const std::complex<double>* C,
|
||||||
|
int ldc, int* lwork) {
|
||||||
|
return cusolverDnZunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
|
||||||
|
reinterpret_cast<const cuDoubleComplex*>(tau),
|
||||||
|
reinterpret_cast<const cuDoubleComplex*>(C), ldc, lwork);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace internal
|
} // namespace internal
|
||||||
} // namespace Eigen
|
} // namespace Eigen
|
||||||
|
|
||||||
|
|||||||
34
Eigen/src/GPU/CuSparseSupport.h
Normal file
34
Eigen/src/GPU/CuSparseSupport.h
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// cuSPARSE support utilities: error checking macro.
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_CUSPARSE_SUPPORT_H
|
||||||
|
#define EIGEN_GPU_CUSPARSE_SUPPORT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSupport.h"
|
||||||
|
#include <cusparse.h>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
#define EIGEN_CUSPARSE_CHECK(x) \
|
||||||
|
do { \
|
||||||
|
cusparseStatus_t _s = (x); \
|
||||||
|
eigen_assert(_s == CUSPARSE_STATUS_SUCCESS && "cuSPARSE call failed: " #x); \
|
||||||
|
EIGEN_UNUSED_VARIABLE(_s); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_CUSPARSE_SUPPORT_H
|
||||||
@@ -58,8 +58,8 @@ void dispatch_gemm(
|
|||||||
|
|
||||||
eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch");
|
eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch");
|
||||||
|
|
||||||
const int64_t lda = A.outerStride();
|
const int64_t lda = A.rows();
|
||||||
const int64_t ldb = B.outerStride();
|
const int64_t ldb = B.rows();
|
||||||
|
|
||||||
// Serialize all accesses to the destination buffer on this stream.
|
// Serialize all accesses to the destination buffer on this stream.
|
||||||
if (!dst.empty()) {
|
if (!dst.empty()) {
|
||||||
@@ -71,9 +71,13 @@ void dispatch_gemm(
|
|||||||
if (resized) {
|
if (resized) {
|
||||||
dst.resize(m, n);
|
dst.resize(m, n);
|
||||||
}
|
}
|
||||||
const int64_t ldc = dst.outerStride();
|
const int64_t ldc = dst.rows();
|
||||||
|
|
||||||
Scalar alpha_val = alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs());
|
// cuBLAS requires alpha/beta as float for half/bfloat16 inputs.
|
||||||
|
using GemmScalar = typename cuda_gemm_scalar<Scalar>::type;
|
||||||
|
GemmScalar alpha_gval =
|
||||||
|
static_cast<GemmScalar>(alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs()));
|
||||||
|
GemmScalar beta_gval = static_cast<GemmScalar>(beta_val);
|
||||||
|
|
||||||
// Wait for operands to be ready on this stream.
|
// Wait for operands to be ready on this stream.
|
||||||
A.waitReady(ctx.stream());
|
A.waitReady(ctx.stream());
|
||||||
@@ -81,7 +85,7 @@ void dispatch_gemm(
|
|||||||
|
|
||||||
// If there is no existing valid destination to accumulate into, treat it as
|
// If there is no existing valid destination to accumulate into, treat it as
|
||||||
// zero rather than reading uninitialized memory.
|
// zero rather than reading uninitialized memory.
|
||||||
if (resized && beta_val != Scalar(0) && dst.sizeInBytes() > 0) {
|
if (resized && beta_gval != GemmScalar(0) && dst.sizeInBytes() > 0) {
|
||||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -89,9 +93,9 @@ void dispatch_gemm(
|
|||||||
constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
|
constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
|
||||||
|
|
||||||
EIGEN_CUBLAS_CHECK(cublasGemmEx(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n),
|
EIGEN_CUBLAS_CHECK(cublasGemmEx(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n),
|
||||||
static_cast<int>(k), &alpha_val, A.data(), dtype, static_cast<int>(lda), B.data(),
|
static_cast<int>(k), &alpha_gval, A.data(), dtype, static_cast<int>(lda), B.data(),
|
||||||
dtype, static_cast<int>(ldb), &beta_val, dst.data(), dtype, static_cast<int>(ldc),
|
dtype, static_cast<int>(ldb), &beta_gval, dst.data(), dtype, static_cast<int>(ldc),
|
||||||
compute, CUBLAS_GEMM_DEFAULT));
|
compute, cuda_gemm_algo()));
|
||||||
|
|
||||||
dst.recordReady(ctx.stream());
|
dst.recordReady(ctx.stream());
|
||||||
}
|
}
|
||||||
@@ -125,9 +129,9 @@ void dispatch_llt_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LltSol
|
|||||||
|
|
||||||
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
||||||
constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo, ColMajor>::value;
|
constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo, ColMajor>::value;
|
||||||
const int64_t lda = static_cast<int64_t>(A.outerStride());
|
const int64_t lda = static_cast<int64_t>(A.rows());
|
||||||
const int64_t ldb = static_cast<int64_t>(B.outerStride());
|
const int64_t ldb = static_cast<int64_t>(B.rows());
|
||||||
eigen_assert(ldb == static_cast<int64_t>(B.rows()) && "DeviceMatrix must be densely packed");
|
|
||||||
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
|
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
|
||||||
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||||
|
|
||||||
@@ -163,7 +167,7 @@ void dispatch_llt_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LltSol
|
|||||||
// Solve.
|
// Solve.
|
||||||
DeviceBuffer d_solve_info(sizeof(int));
|
DeviceBuffer d_solve_info(sizeof(int));
|
||||||
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), nrhs, dtype,
|
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), nrhs, dtype,
|
||||||
d_factor.ptr, lda, dtype, dst.data(), static_cast<int64_t>(dst.outerStride()),
|
d_factor.ptr, lda, dtype, dst.data(), static_cast<int64_t>(dst.rows()),
|
||||||
static_cast<int*>(d_solve_info.ptr)));
|
static_cast<int*>(d_solve_info.ptr)));
|
||||||
|
|
||||||
// Sync to ensure workspace locals can be freed safely.
|
// Sync to ensure workspace locals can be freed safely.
|
||||||
@@ -201,9 +205,9 @@ void dispatch_lu_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LuSolve
|
|||||||
if (!dst.empty()) dst.waitReady(ctx.stream());
|
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||||
|
|
||||||
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
||||||
const int64_t lda = static_cast<int64_t>(A.outerStride());
|
const int64_t lda = static_cast<int64_t>(A.rows());
|
||||||
const int64_t ldb = static_cast<int64_t>(B.outerStride());
|
const int64_t ldb = static_cast<int64_t>(B.rows());
|
||||||
eigen_assert(ldb == static_cast<int64_t>(B.rows()) && "DeviceMatrix must be densely packed");
|
|
||||||
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
|
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
|
||||||
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||||
const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t);
|
const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t);
|
||||||
@@ -245,7 +249,7 @@ void dispatch_lu_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LuSolve
|
|||||||
DeviceBuffer d_solve_info(sizeof(int));
|
DeviceBuffer d_solve_info(sizeof(int));
|
||||||
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, static_cast<int64_t>(n), nrhs,
|
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, static_cast<int64_t>(n), nrhs,
|
||||||
dtype, d_lu.ptr, lda, static_cast<const int64_t*>(d_ipiv.ptr), dtype,
|
dtype, d_lu.ptr, lda, static_cast<const int64_t*>(d_ipiv.ptr), dtype,
|
||||||
dst.data(), static_cast<int64_t>(dst.outerStride()),
|
dst.data(), static_cast<int64_t>(dst.rows()),
|
||||||
static_cast<int*>(d_solve_info.ptr)));
|
static_cast<int*>(d_solve_info.ptr)));
|
||||||
|
|
||||||
// Sync to ensure workspace locals can be freed safely.
|
// Sync to ensure workspace locals can be freed safely.
|
||||||
@@ -285,15 +289,15 @@ void dispatch_trsm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const TrsmExpr<Sc
|
|||||||
|
|
||||||
// D2D copy B → dst (trsm is in-place on the RHS).
|
// D2D copy B → dst (trsm is in-place on the RHS).
|
||||||
dst.resize(n, B.cols());
|
dst.resize(n, B.cols());
|
||||||
const size_t rhs_bytes = static_cast<size_t>(dst.outerStride()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
const size_t rhs_bytes = static_cast<size_t>(dst.rows()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
|
||||||
|
|
||||||
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
|
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
|
||||||
Scalar alpha(1);
|
Scalar alpha(1);
|
||||||
|
|
||||||
EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs,
|
EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs,
|
||||||
&alpha, A.data(), static_cast<int>(A.outerStride()), dst.data(),
|
&alpha, A.data(), static_cast<int>(A.rows()), dst.data(),
|
||||||
static_cast<int>(dst.outerStride())));
|
static_cast<int>(dst.rows())));
|
||||||
|
|
||||||
dst.recordReady(ctx.stream());
|
dst.recordReady(ctx.stream());
|
||||||
}
|
}
|
||||||
@@ -329,8 +333,8 @@ void dispatch_symm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SymmExpr<Sc
|
|||||||
Scalar alpha(1), beta(0);
|
Scalar alpha(1), beta(0);
|
||||||
|
|
||||||
EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &alpha, A.data(),
|
EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &alpha, A.data(),
|
||||||
static_cast<int>(A.outerStride()), B.data(), static_cast<int>(B.outerStride()), &beta,
|
static_cast<int>(A.rows()), B.data(), static_cast<int>(B.rows()), &beta, dst.data(),
|
||||||
dst.data(), static_cast<int>(dst.outerStride())));
|
static_cast<int>(dst.rows())));
|
||||||
|
|
||||||
dst.recordReady(ctx.stream());
|
dst.recordReady(ctx.stream());
|
||||||
}
|
}
|
||||||
@@ -367,8 +371,7 @@ void dispatch_syrk(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SyrkExpr<Sc
|
|||||||
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
|
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
|
||||||
|
|
||||||
EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(),
|
EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(),
|
||||||
static_cast<int>(A.outerStride()), &beta_val, dst.data(),
|
static_cast<int>(A.rows()), &beta_val, dst.data(), static_cast<int>(dst.rows())));
|
||||||
static_cast<int>(dst.outerStride())));
|
|
||||||
|
|
||||||
dst.recordReady(ctx.stream());
|
dst.recordReady(ctx.stream());
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,7 +10,7 @@
|
|||||||
// Typed RAII wrapper for a dense matrix in GPU device memory.
|
// Typed RAII wrapper for a dense matrix in GPU device memory.
|
||||||
//
|
//
|
||||||
// DeviceMatrix<Scalar> holds a column-major matrix on the GPU with tracked
|
// DeviceMatrix<Scalar> holds a column-major matrix on the GPU with tracked
|
||||||
// dimensions and leading dimension. It can be passed to GPU solvers
|
// dimensions. Always dense (leading dimension = rows). It can be passed to GPU solvers
|
||||||
// (GpuLLT, GpuLU, future cuBLAS/cuDSS) without host round-trips.
|
// (GpuLLT, GpuLU, future cuBLAS/cuDSS) without host round-trips.
|
||||||
//
|
//
|
||||||
// Cross-stream safety is automatic: an internal CUDA event tracks when the
|
// Cross-stream safety is automatic: an internal CUDA event tracks when the
|
||||||
@@ -25,7 +25,7 @@
|
|||||||
// MatrixXd X = d_X.toHost(); // download + block
|
// MatrixXd X = d_X.toHost(); // download + block
|
||||||
//
|
//
|
||||||
// Async variants:
|
// Async variants:
|
||||||
// auto d_A = DeviceMatrix<double>::fromHostAsync(A.data(), n, n, n, stream);
|
// auto d_A = DeviceMatrix<double>::fromHostAsync(A.data(), n, n, stream);
|
||||||
// auto transfer = d_X.toHostAsync(stream); // enqueue D2H
|
// auto transfer = d_X.toHostAsync(stream); // enqueue D2H
|
||||||
// // ... overlap with other work ...
|
// // ... overlap with other work ...
|
||||||
// MatrixXd X = transfer.get(); // block + retrieve
|
// MatrixXd X = transfer.get(); // block + retrieve
|
||||||
@@ -157,7 +157,8 @@ class HostTransfer {
|
|||||||
*
|
*
|
||||||
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
|
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
|
||||||
*
|
*
|
||||||
* Owns a device allocation with tracked dimensions and leading dimension.
|
* Owns a device allocation with tracked dimensions. Always dense
|
||||||
|
* (leading dimension = rows; no stride padding).
|
||||||
* An internal CUDA event records when the data was last written, enabling
|
* An internal CUDA event records when the data was last written, enabling
|
||||||
* safe cross-stream consumption without user-visible synchronization.
|
* safe cross-stream consumption without user-visible synchronization.
|
||||||
*
|
*
|
||||||
@@ -177,7 +178,7 @@ class DeviceMatrix {
|
|||||||
DeviceMatrix() = default;
|
DeviceMatrix() = default;
|
||||||
|
|
||||||
/** Allocate uninitialized device memory for a rows x cols matrix. */
|
/** Allocate uninitialized device memory for a rows x cols matrix. */
|
||||||
DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols), outerStride_(rows) {
|
DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols) {
|
||||||
eigen_assert(rows >= 0 && cols >= 0);
|
eigen_assert(rows >= 0 && cols >= 0);
|
||||||
size_t bytes = sizeInBytes();
|
size_t bytes = sizeInBytes();
|
||||||
if (bytes > 0) {
|
if (bytes > 0) {
|
||||||
@@ -196,14 +197,12 @@ class DeviceMatrix {
|
|||||||
: data_(o.data_),
|
: data_(o.data_),
|
||||||
rows_(o.rows_),
|
rows_(o.rows_),
|
||||||
cols_(o.cols_),
|
cols_(o.cols_),
|
||||||
outerStride_(o.outerStride_),
|
|
||||||
ready_event_(o.ready_event_),
|
ready_event_(o.ready_event_),
|
||||||
ready_stream_(o.ready_stream_),
|
ready_stream_(o.ready_stream_),
|
||||||
retained_buffer_(std::move(o.retained_buffer_)) {
|
retained_buffer_(std::move(o.retained_buffer_)) {
|
||||||
o.data_ = nullptr;
|
o.data_ = nullptr;
|
||||||
o.rows_ = 0;
|
o.rows_ = 0;
|
||||||
o.cols_ = 0;
|
o.cols_ = 0;
|
||||||
o.outerStride_ = 0;
|
|
||||||
o.ready_event_ = nullptr;
|
o.ready_event_ = nullptr;
|
||||||
o.ready_stream_ = nullptr;
|
o.ready_stream_ = nullptr;
|
||||||
}
|
}
|
||||||
@@ -215,14 +214,12 @@ class DeviceMatrix {
|
|||||||
data_ = o.data_;
|
data_ = o.data_;
|
||||||
rows_ = o.rows_;
|
rows_ = o.rows_;
|
||||||
cols_ = o.cols_;
|
cols_ = o.cols_;
|
||||||
outerStride_ = o.outerStride_;
|
|
||||||
ready_event_ = o.ready_event_;
|
ready_event_ = o.ready_event_;
|
||||||
ready_stream_ = o.ready_stream_;
|
ready_stream_ = o.ready_stream_;
|
||||||
retained_buffer_ = std::move(o.retained_buffer_);
|
retained_buffer_ = std::move(o.retained_buffer_);
|
||||||
o.data_ = nullptr;
|
o.data_ = nullptr;
|
||||||
o.rows_ = 0;
|
o.rows_ = 0;
|
||||||
o.cols_ = 0;
|
o.cols_ = 0;
|
||||||
o.outerStride_ = 0;
|
|
||||||
o.ready_event_ = nullptr;
|
o.ready_event_ = nullptr;
|
||||||
o.ready_stream_ = nullptr;
|
o.ready_stream_ = nullptr;
|
||||||
}
|
}
|
||||||
@@ -259,29 +256,17 @@ class DeviceMatrix {
|
|||||||
* The caller must keep \p host_data alive until the transfer completes
|
* The caller must keep \p host_data alive until the transfer completes
|
||||||
* (check via the internal event or synchronize the stream).
|
* (check via the internal event or synchronize the stream).
|
||||||
*
|
*
|
||||||
* \param host_data Pointer to contiguous column-major host data.
|
* \param host_data Pointer to contiguous column-major host data.
|
||||||
* \param rows Number of rows.
|
* \param rows Number of rows.
|
||||||
* \param cols Number of columns.
|
* \param cols Number of columns.
|
||||||
* \param outerStride Leading dimension (>= rows). Use rows for dense.
|
* \param stream CUDA stream for the transfer.
|
||||||
* \param stream CUDA stream for the transfer.
|
|
||||||
*/
|
*/
|
||||||
static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, Index outerStride,
|
static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, cudaStream_t stream) {
|
||||||
cudaStream_t stream) {
|
eigen_assert(rows >= 0 && cols >= 0);
|
||||||
eigen_assert(rows >= 0 && cols >= 0 && outerStride >= rows);
|
|
||||||
eigen_assert(host_data != nullptr || (rows == 0 || cols == 0));
|
eigen_assert(host_data != nullptr || (rows == 0 || cols == 0));
|
||||||
DeviceMatrix dm(rows, cols);
|
DeviceMatrix dm(rows, cols);
|
||||||
if (dm.sizeInBytes() > 0) {
|
if (dm.sizeInBytes() > 0) {
|
||||||
// If outerStride == rows (dense), single contiguous copy.
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dm.data_, host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
|
||||||
// Otherwise, copy column by column (strided layout).
|
|
||||||
if (outerStride == rows) {
|
|
||||||
EIGEN_CUDA_RUNTIME_CHECK(
|
|
||||||
cudaMemcpyAsync(dm.data_, host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
|
|
||||||
} else {
|
|
||||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(dm.data_, static_cast<size_t>(rows) * sizeof(Scalar), host_data,
|
|
||||||
static_cast<size_t>(outerStride) * sizeof(Scalar),
|
|
||||||
static_cast<size_t>(rows) * sizeof(Scalar),
|
|
||||||
static_cast<size_t>(cols), cudaMemcpyHostToDevice, stream));
|
|
||||||
}
|
|
||||||
dm.recordReady(stream);
|
dm.recordReady(stream);
|
||||||
}
|
}
|
||||||
return dm;
|
return dm;
|
||||||
@@ -360,7 +345,6 @@ class DeviceMatrix {
|
|||||||
retained_buffer_ = internal::DeviceBuffer();
|
retained_buffer_ = internal::DeviceBuffer();
|
||||||
rows_ = rows;
|
rows_ = rows;
|
||||||
cols_ = cols;
|
cols_ = cols;
|
||||||
outerStride_ = rows;
|
|
||||||
size_t bytes = sizeInBytes();
|
size_t bytes = sizeInBytes();
|
||||||
if (bytes > 0) {
|
if (bytes > 0) {
|
||||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
|
||||||
@@ -373,11 +357,10 @@ class DeviceMatrix {
|
|||||||
const Scalar* data() const { return data_; }
|
const Scalar* data() const { return data_; }
|
||||||
Index rows() const { return rows_; }
|
Index rows() const { return rows_; }
|
||||||
Index cols() const { return cols_; }
|
Index cols() const { return cols_; }
|
||||||
Index outerStride() const { return outerStride_; }
|
|
||||||
bool empty() const { return rows_ == 0 || cols_ == 0; }
|
bool empty() const { return rows_ == 0 || cols_ == 0; }
|
||||||
|
|
||||||
/** Size of the device allocation in bytes. */
|
/** Size of the device allocation in bytes. */
|
||||||
size_t sizeInBytes() const { return static_cast<size_t>(outerStride_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
|
size_t sizeInBytes() const { return static_cast<size_t>(rows_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
|
||||||
|
|
||||||
// ---- Event synchronization (public for library dispatch interop) ---------
|
// ---- Event synchronization (public for library dispatch interop) ---------
|
||||||
|
|
||||||
@@ -466,8 +449,7 @@ class DeviceMatrix {
|
|||||||
private:
|
private:
|
||||||
// ---- Private: adopt a raw device pointer (used by friend solvers) --------
|
// ---- Private: adopt a raw device pointer (used by friend solvers) --------
|
||||||
|
|
||||||
DeviceMatrix(Scalar* device_ptr, Index rows, Index cols, Index outerStride)
|
DeviceMatrix(Scalar* device_ptr, Index rows, Index cols) : data_(device_ptr), rows_(rows), cols_(cols) {}
|
||||||
: data_(device_ptr), rows_(rows), cols_(cols), outerStride_(outerStride) {}
|
|
||||||
|
|
||||||
/** Transfer ownership of the device pointer out. Zeros internal state. */
|
/** Transfer ownership of the device pointer out. Zeros internal state. */
|
||||||
Scalar* release() {
|
Scalar* release() {
|
||||||
@@ -475,7 +457,6 @@ class DeviceMatrix {
|
|||||||
data_ = nullptr;
|
data_ = nullptr;
|
||||||
rows_ = 0;
|
rows_ = 0;
|
||||||
cols_ = 0;
|
cols_ = 0;
|
||||||
outerStride_ = 0;
|
|
||||||
if (ready_event_) {
|
if (ready_event_) {
|
||||||
(void)cudaEventDestroy(ready_event_);
|
(void)cudaEventDestroy(ready_event_);
|
||||||
ready_event_ = nullptr;
|
ready_event_ = nullptr;
|
||||||
@@ -500,13 +481,18 @@ class DeviceMatrix {
|
|||||||
friend class GpuLLT;
|
friend class GpuLLT;
|
||||||
template <typename>
|
template <typename>
|
||||||
friend class GpuLU;
|
friend class GpuLU;
|
||||||
|
template <typename>
|
||||||
|
friend class GpuQR;
|
||||||
|
template <typename>
|
||||||
|
friend class GpuSVD;
|
||||||
|
template <typename>
|
||||||
|
friend class GpuSelfAdjointEigenSolver;
|
||||||
|
|
||||||
// ---- Data members --------------------------------------------------------
|
// ---- Data members --------------------------------------------------------
|
||||||
|
|
||||||
Scalar* data_ = nullptr;
|
Scalar* data_ = nullptr;
|
||||||
Index rows_ = 0;
|
Index rows_ = 0;
|
||||||
Index cols_ = 0;
|
Index cols_ = 0;
|
||||||
Index outerStride_ = 0;
|
|
||||||
cudaEvent_t ready_event_ = nullptr; // internal: tracks last write completion
|
cudaEvent_t ready_event_ = nullptr; // internal: tracks last write completion
|
||||||
cudaStream_t ready_stream_ = nullptr; // stream that recorded ready_event_ (for same-stream skip)
|
cudaStream_t ready_stream_ = nullptr; // stream that recorded ready_event_ (for same-stream skip)
|
||||||
internal::DeviceBuffer retained_buffer_; // internal: keeps async aux buffers alive
|
internal::DeviceBuffer retained_buffer_; // internal: keeps async aux buffers alive
|
||||||
|
|||||||
232
Eigen/src/GPU/GpuEigenSolver.h
Normal file
232
Eigen/src/GPU/GpuEigenSolver.h
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU self-adjoint eigenvalue decomposition using cuSOLVER.
|
||||||
|
//
|
||||||
|
// Wraps cusolverDnXsyevd (symmetric/Hermitian divide-and-conquer).
|
||||||
|
// Stores eigenvalues and eigenvectors on device.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuSelfAdjointEigenSolver<double> es(A);
|
||||||
|
// VectorXd eigenvals = es.eigenvalues();
|
||||||
|
// MatrixXd eigenvecs = es.eigenvectors();
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_EIGENSOLVER_H
|
||||||
|
#define EIGEN_GPU_EIGENSOLVER_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuSolverSupport.h"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class GpuSelfAdjointEigenSolver {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||||
|
using RealVector = Matrix<RealScalar, Dynamic, 1>;
|
||||||
|
|
||||||
|
/** Eigenvalue-only or eigenvalues + eigenvectors. */
|
||||||
|
enum ComputeMode { EigenvaluesOnly, ComputeEigenvectors };
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver() { init_context(); }
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
explicit GpuSelfAdjointEigenSolver(const EigenBase<InputType>& A, ComputeMode mode = ComputeEigenvectors) {
|
||||||
|
init_context();
|
||||||
|
compute(A, mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
~GpuSelfAdjointEigenSolver() {
|
||||||
|
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||||
|
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver(const GpuSelfAdjointEigenSolver&) = delete;
|
||||||
|
GpuSelfAdjointEigenSolver& operator=(const GpuSelfAdjointEigenSolver&) = delete;
|
||||||
|
|
||||||
|
// ---- Factorization -------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
GpuSelfAdjointEigenSolver& compute(const EigenBase<InputType>& A, ComputeMode mode = ComputeEigenvectors) {
|
||||||
|
eigen_assert(A.rows() == A.cols() && "GpuSelfAdjointEigenSolver requires a square matrix");
|
||||||
|
mode_ = mode;
|
||||||
|
n_ = A.rows();
|
||||||
|
info_ = InvalidInput;
|
||||||
|
info_synced_ = false;
|
||||||
|
|
||||||
|
if (n_ == 0) {
|
||||||
|
info_ = Success;
|
||||||
|
info_synced_ = true;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
const PlainMatrix mat(A.derived());
|
||||||
|
lda_ = static_cast<int64_t>(n_);
|
||||||
|
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||||
|
|
||||||
|
// syevd overwrites A with eigenvectors (if requested).
|
||||||
|
d_A_ = internal::DeviceBuffer(mat_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver& compute(const DeviceMatrix<Scalar>& d_A, ComputeMode mode = ComputeEigenvectors) {
|
||||||
|
eigen_assert(d_A.rows() == d_A.cols() && "GpuSelfAdjointEigenSolver requires a square matrix");
|
||||||
|
mode_ = mode;
|
||||||
|
n_ = d_A.rows();
|
||||||
|
info_ = InvalidInput;
|
||||||
|
info_synced_ = false;
|
||||||
|
|
||||||
|
if (n_ == 0) {
|
||||||
|
info_ = Success;
|
||||||
|
info_synced_ = true;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
d_A.waitReady(stream_);
|
||||||
|
lda_ = static_cast<int64_t>(n_);
|
||||||
|
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||||
|
|
||||||
|
d_A_ = internal::DeviceBuffer(mat_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
|
||||||
|
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Accessors -----------------------------------------------------------
|
||||||
|
|
||||||
|
ComputationInfo info() const {
|
||||||
|
sync_info();
|
||||||
|
return info_;
|
||||||
|
}
|
||||||
|
|
||||||
|
Index cols() const { return n_; }
|
||||||
|
Index rows() const { return n_; }
|
||||||
|
|
||||||
|
// TODO: Add device-side accessors (deviceEigenvalues(), deviceEigenvectors())
|
||||||
|
// returning DeviceMatrix views of the internal buffers, so users can chain
|
||||||
|
// GPU operations without round-tripping through host memory.
|
||||||
|
|
||||||
|
/** Eigenvalues in ascending order. Downloads from device. */
|
||||||
|
RealVector eigenvalues() const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success);
|
||||||
|
RealVector W(n_);
|
||||||
|
if (n_ > 0) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpy(W.data(), d_W_.ptr, static_cast<size_t>(n_) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
|
||||||
|
}
|
||||||
|
return W;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Eigenvectors (columns). Downloads from device.
|
||||||
|
* Requires ComputeEigenvectors mode. */
|
||||||
|
PlainMatrix eigenvectors() const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success);
|
||||||
|
eigen_assert(mode_ == ComputeEigenvectors && "eigenvectors() requires ComputeEigenvectors mode");
|
||||||
|
PlainMatrix V(n_, n_);
|
||||||
|
if (n_ > 0) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(V.data(), d_A_.ptr,
|
||||||
|
static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost));
|
||||||
|
}
|
||||||
|
return V;
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cusolverDnHandle_t handle_ = nullptr;
|
||||||
|
internal::CusolverParams params_;
|
||||||
|
internal::DeviceBuffer d_A_; // overwritten with eigenvectors by syevd
|
||||||
|
internal::DeviceBuffer d_W_; // eigenvalues (RealScalar, length n)
|
||||||
|
internal::DeviceBuffer d_scratch_; // workspace + info
|
||||||
|
size_t scratch_size_ = 0;
|
||||||
|
std::vector<char> h_workspace_;
|
||||||
|
ComputeMode mode_ = ComputeEigenvectors;
|
||||||
|
Index n_ = 0;
|
||||||
|
int64_t lda_ = 0;
|
||||||
|
ComputationInfo info_ = InvalidInput;
|
||||||
|
int info_word_ = 0;
|
||||||
|
bool info_synced_ = true;
|
||||||
|
|
||||||
|
void init_context() {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
|
||||||
|
ensure_scratch(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ensure_scratch(size_t workspace_bytes) {
|
||||||
|
constexpr size_t kAlign = 16;
|
||||||
|
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
|
||||||
|
size_t needed = workspace_bytes + sizeof(int);
|
||||||
|
if (needed > scratch_size_) {
|
||||||
|
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
d_scratch_ = internal::DeviceBuffer(needed);
|
||||||
|
scratch_size_ = needed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void* scratch_workspace() const { return d_scratch_.ptr; }
|
||||||
|
int* scratch_info() const {
|
||||||
|
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
|
||||||
|
}
|
||||||
|
|
||||||
|
void sync_info() const {
|
||||||
|
if (!info_synced_) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
const_cast<GpuSelfAdjointEigenSolver*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
|
||||||
|
const_cast<GpuSelfAdjointEigenSolver*>(this)->info_synced_ = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void factorize() {
|
||||||
|
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||||
|
constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
|
||||||
|
|
||||||
|
info_synced_ = false;
|
||||||
|
info_ = InvalidInput;
|
||||||
|
|
||||||
|
d_W_ = internal::DeviceBuffer(static_cast<size_t>(n_) * sizeof(RealScalar));
|
||||||
|
|
||||||
|
const cusolverEigMode_t jobz =
|
||||||
|
(mode_ == ComputeEigenvectors) ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
|
||||||
|
|
||||||
|
// Use lower triangle (standard convention).
|
||||||
|
constexpr cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
|
||||||
|
|
||||||
|
size_t dev_ws = 0, host_ws = 0;
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd_bufferSize(handle_, params_.p, jobz, uplo, static_cast<int64_t>(n_), dtype,
|
||||||
|
d_A_.ptr, lda_, rtype, d_W_.ptr, dtype, &dev_ws, &host_ws));
|
||||||
|
|
||||||
|
ensure_scratch(dev_ws);
|
||||||
|
h_workspace_.resize(host_ws);
|
||||||
|
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd(handle_, params_.p, jobz, uplo, static_cast<int64_t>(n_), dtype, d_A_.ptr,
|
||||||
|
lda_, rtype, d_W_.ptr, dtype, scratch_workspace(), dev_ws,
|
||||||
|
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_EIGENSOLVER_H
|
||||||
308
Eigen/src/GPU/GpuFFT.h
Normal file
308
Eigen/src/GPU/GpuFFT.h
Normal file
@@ -0,0 +1,308 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU FFT via cuFFT.
|
||||||
|
//
|
||||||
|
// Standalone GPU FFT class with plan caching. Supports 1D and 2D transforms:
|
||||||
|
// C2C (complex-to-complex), R2C (real-to-complex), C2R (complex-to-real).
|
||||||
|
//
|
||||||
|
// Inverse transforms are scaled by 1/n (1D) or 1/(n*m) (2D) so that
|
||||||
|
// inv(fwd(x)) == x, matching Eigen's FFT convention.
|
||||||
|
//
|
||||||
|
// cuFFT plans are cached by (size, type) and reused across calls.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuFFT<float> fft;
|
||||||
|
// VectorXcf X = fft.fwd(x); // 1D C2C or R2C
|
||||||
|
// VectorXcf y = fft.inv(X); // 1D C2C inverse
|
||||||
|
// VectorXf r = fft.invReal(X, n); // 1D C2R inverse
|
||||||
|
// MatrixXcf B = fft.fwd2d(A); // 2D C2C forward
|
||||||
|
// MatrixXcf C = fft.inv2d(B); // 2D C2C inverse
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_FFT_H
|
||||||
|
#define EIGEN_GPU_FFT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuFftSupport.h"
|
||||||
|
#include "./CuBlasSupport.h"
|
||||||
|
#include <map>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class GpuFFT {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using ComplexVector = Matrix<Complex, Dynamic, 1>;
|
||||||
|
using RealVector = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using ComplexMatrix = Matrix<Complex, Dynamic, Dynamic, ColMajor>;
|
||||||
|
|
||||||
|
GpuFFT() {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
~GpuFFT() {
|
||||||
|
for (auto& kv : plans_) (void)cufftDestroy(kv.second);
|
||||||
|
if (cublas_) (void)cublasDestroy(cublas_);
|
||||||
|
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuFFT(const GpuFFT&) = delete;
|
||||||
|
GpuFFT& operator=(const GpuFFT&) = delete;
|
||||||
|
|
||||||
|
// ---- 1D Complex-to-Complex ------------------------------------------------
|
||||||
|
|
||||||
|
/** Forward 1D C2C FFT. */
|
||||||
|
template <typename Derived>
|
||||||
|
ComplexVector fwd(const MatrixBase<Derived>& x,
|
||||||
|
typename std::enable_if<NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
|
||||||
|
const ComplexVector input(x.derived());
|
||||||
|
const int n = static_cast<int>(input.size());
|
||||||
|
if (n == 0) return ComplexVector(0);
|
||||||
|
|
||||||
|
ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
|
||||||
|
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
|
||||||
|
static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));
|
||||||
|
|
||||||
|
ComplexVector result(n);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Inverse 1D C2C FFT. Scaled by 1/n. */
|
||||||
|
template <typename Derived>
|
||||||
|
ComplexVector inv(const MatrixBase<Derived>& X) {
|
||||||
|
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv() requires complex input");
|
||||||
|
const ComplexVector input(X.derived());
|
||||||
|
const int n = static_cast<int>(input.size());
|
||||||
|
if (n == 0) return ComplexVector(0);
|
||||||
|
|
||||||
|
ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
|
||||||
|
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
|
||||||
|
static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));
|
||||||
|
|
||||||
|
// Scale by 1/n.
|
||||||
|
scale_device(static_cast<Complex*>(d_out_.ptr), n, Scalar(1) / Scalar(n));
|
||||||
|
|
||||||
|
ComplexVector result(n);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 1D Real-to-Complex ---------------------------------------------------
|
||||||
|
|
||||||
|
/** Forward 1D R2C FFT. Returns n/2+1 complex values (half-spectrum). */
|
||||||
|
template <typename Derived>
|
||||||
|
ComplexVector fwd(const MatrixBase<Derived>& x,
|
||||||
|
typename std::enable_if<!NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
|
||||||
|
const RealVector input(x.derived());
|
||||||
|
const int n = static_cast<int>(input.size());
|
||||||
|
if (n == 0) return ComplexVector(0);
|
||||||
|
|
||||||
|
const int n_complex = n / 2 + 1;
|
||||||
|
ensure_buffers(n * sizeof(Scalar), n_complex * sizeof(Complex));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
cufftHandle plan = get_plan_1d(n, internal::cufft_r2c_type<Scalar>::value);
|
||||||
|
EIGEN_CUFFT_CHECK(
|
||||||
|
internal::cufftExecR2C_dispatch(plan, static_cast<Scalar*>(d_in_.ptr), static_cast<Complex*>(d_out_.ptr)));
|
||||||
|
|
||||||
|
ComplexVector result(n_complex);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(result.data(), d_out_.ptr, n_complex * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 1D Complex-to-Real ---------------------------------------------------
|
||||||
|
|
||||||
|
/** Inverse 1D C2R FFT. Input is n/2+1 complex values, output is nfft real values.
|
||||||
|
* Scaled by 1/nfft. Caller must specify nfft (original real signal length). */
|
||||||
|
template <typename Derived>
|
||||||
|
RealVector invReal(const MatrixBase<Derived>& X, Index nfft) {
|
||||||
|
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "invReal() requires complex input");
|
||||||
|
const ComplexVector input(X.derived());
|
||||||
|
const int n = static_cast<int>(nfft);
|
||||||
|
const int n_complex = n / 2 + 1;
|
||||||
|
eigen_assert(input.size() == n_complex);
|
||||||
|
if (n == 0) return RealVector(0);
|
||||||
|
|
||||||
|
ensure_buffers(n_complex * sizeof(Complex), n * sizeof(Scalar));
|
||||||
|
// cuFFT C2R may overwrite the input, so we copy to d_in_.
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_in_.ptr, input.data(), n_complex * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
cufftHandle plan = get_plan_1d(n, internal::cufft_c2r_type<Scalar>::value);
|
||||||
|
EIGEN_CUFFT_CHECK(
|
||||||
|
internal::cufftExecC2R_dispatch(plan, static_cast<Complex*>(d_in_.ptr), static_cast<Scalar*>(d_out_.ptr)));
|
||||||
|
|
||||||
|
// Scale by 1/n.
|
||||||
|
scale_device_real(static_cast<Scalar*>(d_out_.ptr), n, Scalar(1) / Scalar(n));
|
||||||
|
|
||||||
|
RealVector result(n);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 2D Complex-to-Complex ------------------------------------------------
|
||||||
|
|
||||||
|
/** Forward 2D C2C FFT. Input and output are rows x cols complex matrices. */
|
||||||
|
template <typename Derived>
|
||||||
|
ComplexMatrix fwd2d(const MatrixBase<Derived>& A) {
|
||||||
|
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "fwd2d() requires complex input");
|
||||||
|
const ComplexMatrix input(A.derived());
|
||||||
|
const int rows = static_cast<int>(input.rows());
|
||||||
|
const int cols = static_cast<int>(input.cols());
|
||||||
|
if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
|
||||||
|
|
||||||
|
const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
|
||||||
|
ensure_buffers(total, total);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
|
||||||
|
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
|
||||||
|
static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));
|
||||||
|
|
||||||
|
ComplexMatrix result(rows, cols);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Inverse 2D C2C FFT. Scaled by 1/(rows*cols). */
|
||||||
|
template <typename Derived>
|
||||||
|
ComplexMatrix inv2d(const MatrixBase<Derived>& A) {
|
||||||
|
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv2d() requires complex input");
|
||||||
|
const ComplexMatrix input(A.derived());
|
||||||
|
const int rows = static_cast<int>(input.rows());
|
||||||
|
const int cols = static_cast<int>(input.cols());
|
||||||
|
if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
|
||||||
|
|
||||||
|
const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
|
||||||
|
ensure_buffers(total, total);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
|
||||||
|
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
|
||||||
|
static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));
|
||||||
|
|
||||||
|
// Scale by 1/(rows*cols).
|
||||||
|
const int total_elems = rows * cols;
|
||||||
|
scale_device(static_cast<Complex*>(d_out_.ptr), total_elems, Scalar(1) / Scalar(total_elems));
|
||||||
|
|
||||||
|
ComplexMatrix result(rows, cols);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Accessors ------------------------------------------------------------
|
||||||
|
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cublasHandle_t cublas_ = nullptr;
|
||||||
|
std::map<int64_t, cufftHandle> plans_;
|
||||||
|
internal::DeviceBuffer d_in_;
|
||||||
|
internal::DeviceBuffer d_out_;
|
||||||
|
size_t d_in_size_ = 0;
|
||||||
|
size_t d_out_size_ = 0;
|
||||||
|
|
||||||
|
void ensure_buffers(size_t in_bytes, size_t out_bytes) {
|
||||||
|
if (in_bytes > d_in_size_) {
|
||||||
|
if (d_in_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
d_in_ = internal::DeviceBuffer(in_bytes);
|
||||||
|
d_in_size_ = in_bytes;
|
||||||
|
}
|
||||||
|
if (out_bytes > d_out_size_) {
|
||||||
|
if (d_out_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
d_out_ = internal::DeviceBuffer(out_bytes);
|
||||||
|
d_out_size_ = out_bytes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Plan key encoding: rank (1 bit) | type (4 bits) | dims
|
||||||
|
static int64_t plan_key_1d(int n, cufftType type) { return (int64_t(n) << 5) | (int64_t(type) << 1) | 0; }
|
||||||
|
|
||||||
|
static int64_t plan_key_2d(int rows, int cols, cufftType type) {
|
||||||
|
return (int64_t(rows) << 35) | (int64_t(cols) << 5) | (int64_t(type) << 1) | 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
cufftHandle get_plan_1d(int n, cufftType type) {
|
||||||
|
int64_t key = plan_key_1d(n, type);
|
||||||
|
auto it = plans_.find(key);
|
||||||
|
if (it != plans_.end()) return it->second;
|
||||||
|
|
||||||
|
cufftHandle plan;
|
||||||
|
EIGEN_CUFFT_CHECK(cufftPlan1d(&plan, n, type, /*batch=*/1));
|
||||||
|
EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
|
||||||
|
plans_[key] = plan;
|
||||||
|
return plan;
|
||||||
|
}
|
||||||
|
|
||||||
|
cufftHandle get_plan_2d(int rows, int cols, cufftType type) {
|
||||||
|
int64_t key = plan_key_2d(rows, cols, type);
|
||||||
|
auto it = plans_.find(key);
|
||||||
|
if (it != plans_.end()) return it->second;
|
||||||
|
|
||||||
|
// cuFFT uses row-major (C order) for 2D: first dim = rows, second = cols.
|
||||||
|
// Eigen matrices are column-major, so we pass (cols, rows) to cuFFT
|
||||||
|
// to get the correct 2D transform.
|
||||||
|
cufftHandle plan;
|
||||||
|
EIGEN_CUFFT_CHECK(cufftPlan2d(&plan, cols, rows, type));
|
||||||
|
EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
|
||||||
|
plans_[key] = plan;
|
||||||
|
return plan;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scale complex array on device using cuBLAS scal.
|
||||||
|
void scale_device(Complex* d_ptr, int n, Scalar alpha) { scale_complex(cublas_, d_ptr, n, alpha); }
|
||||||
|
|
||||||
|
// Scale real array on device using cuBLAS scal.
|
||||||
|
void scale_device_real(Scalar* d_ptr, int n, Scalar alpha) { scale_real(cublas_, d_ptr, n, alpha); }
|
||||||
|
|
||||||
|
// Type-dispatched cuBLAS scal wrappers (C++14 compatible).
|
||||||
|
static void scale_complex(cublasHandle_t h, std::complex<float>* p, int n, float a) {
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasCsscal(h, n, &a, reinterpret_cast<cuComplex*>(p), 1));
|
||||||
|
}
|
||||||
|
static void scale_complex(cublasHandle_t h, std::complex<double>* p, int n, double a) {
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasZdscal(h, n, &a, reinterpret_cast<cuDoubleComplex*>(p), 1));
|
||||||
|
}
|
||||||
|
static void scale_real(cublasHandle_t h, float* p, int n, float a) {
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSscal(h, n, &a, p, 1));
|
||||||
|
}
|
||||||
|
static void scale_real(cublasHandle_t h, double* p, int n, double a) {
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasDscal(h, n, &a, p, 1));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_FFT_H
|
||||||
@@ -149,7 +149,7 @@ class GpuLLT {
|
|||||||
|
|
||||||
// Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions).
|
// Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions).
|
||||||
const PlainMatrix mat(A.derived());
|
const PlainMatrix mat(A.derived());
|
||||||
lda_ = static_cast<int64_t>(mat.outerStride());
|
lda_ = static_cast<int64_t>(mat.rows());
|
||||||
allocate_factor_storage();
|
allocate_factor_storage();
|
||||||
EIGEN_CUDA_RUNTIME_CHECK(
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
cudaMemcpyAsync(d_factor_.ptr, mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_));
|
cudaMemcpyAsync(d_factor_.ptr, mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_));
|
||||||
@@ -163,7 +163,7 @@ class GpuLLT {
|
|||||||
eigen_assert(d_A.rows() == d_A.cols());
|
eigen_assert(d_A.rows() == d_A.cols());
|
||||||
if (!begin_compute(d_A.rows())) return *this;
|
if (!begin_compute(d_A.rows())) return *this;
|
||||||
|
|
||||||
lda_ = static_cast<int64_t>(d_A.outerStride());
|
lda_ = static_cast<int64_t>(d_A.rows());
|
||||||
d_A.waitReady(stream_);
|
d_A.waitReady(stream_);
|
||||||
allocate_factor_storage();
|
allocate_factor_storage();
|
||||||
EIGEN_CUDA_RUNTIME_CHECK(
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
@@ -178,7 +178,7 @@ class GpuLLT {
|
|||||||
eigen_assert(d_A.rows() == d_A.cols());
|
eigen_assert(d_A.rows() == d_A.cols());
|
||||||
if (!begin_compute(d_A.rows())) return *this;
|
if (!begin_compute(d_A.rows())) return *this;
|
||||||
|
|
||||||
lda_ = static_cast<int64_t>(d_A.outerStride());
|
lda_ = static_cast<int64_t>(d_A.rows());
|
||||||
d_A.waitReady(stream_);
|
d_A.waitReady(stream_);
|
||||||
d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
|
d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
|
||||||
|
|
||||||
@@ -205,7 +205,7 @@ class GpuLLT {
|
|||||||
|
|
||||||
const PlainMatrix rhs(B);
|
const PlainMatrix rhs(B);
|
||||||
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
||||||
const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
|
const int64_t ldb = static_cast<int64_t>(rhs.rows());
|
||||||
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
|
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
|
||||||
EIGEN_CUDA_RUNTIME_CHECK(
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
cudaMemcpyAsync(d_x_ptr, rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
|
cudaMemcpyAsync(d_x_ptr, rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
|
||||||
@@ -234,7 +234,7 @@ class GpuLLT {
|
|||||||
eigen_assert(d_B.rows() == n_);
|
eigen_assert(d_B.rows() == n_);
|
||||||
d_B.waitReady(stream_);
|
d_B.waitReady(stream_);
|
||||||
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
|
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
|
||||||
const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
|
const int64_t ldb = static_cast<int64_t>(d_B.rows());
|
||||||
return solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
|
return solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
|
||||||
EIGEN_CUDA_RUNTIME_CHECK(
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
cudaMemcpyAsync(d_x_ptr, d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
|
cudaMemcpyAsync(d_x_ptr, d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
|
||||||
@@ -332,7 +332,7 @@ class GpuLLT {
|
|||||||
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, static_cast<int64_t>(n_), nrhs, dtype,
|
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, static_cast<int64_t>(n_), nrhs, dtype,
|
||||||
d_factor_.ptr, lda_, dtype, d_x_ptr, ldb, scratch_info()));
|
d_factor_.ptr, lda_, dtype, d_x_ptr, ldb, scratch_info()));
|
||||||
|
|
||||||
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs), static_cast<Index>(ldb));
|
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs));
|
||||||
result.recordReady(stream_);
|
result.recordReady(stream_);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -140,7 +140,7 @@ class GpuLU {
|
|||||||
if (!begin_compute(A.rows())) return *this;
|
if (!begin_compute(A.rows())) return *this;
|
||||||
|
|
||||||
const PlainMatrix mat(A.derived());
|
const PlainMatrix mat(A.derived());
|
||||||
lda_ = static_cast<int64_t>(mat.outerStride());
|
lda_ = static_cast<int64_t>(mat.rows());
|
||||||
allocate_lu_storage();
|
allocate_lu_storage();
|
||||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_));
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
@@ -153,7 +153,7 @@ class GpuLU {
|
|||||||
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
|
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
|
||||||
if (!begin_compute(d_A.rows())) return *this;
|
if (!begin_compute(d_A.rows())) return *this;
|
||||||
|
|
||||||
lda_ = static_cast<int64_t>(d_A.outerStride());
|
lda_ = static_cast<int64_t>(d_A.rows());
|
||||||
d_A.waitReady(stream_);
|
d_A.waitReady(stream_);
|
||||||
allocate_lu_storage();
|
allocate_lu_storage();
|
||||||
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_));
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_));
|
||||||
@@ -167,7 +167,7 @@ class GpuLU {
|
|||||||
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
|
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
|
||||||
if (!begin_compute(d_A.rows())) return *this;
|
if (!begin_compute(d_A.rows())) return *this;
|
||||||
|
|
||||||
lda_ = static_cast<int64_t>(d_A.outerStride());
|
lda_ = static_cast<int64_t>(d_A.rows());
|
||||||
d_A.waitReady(stream_);
|
d_A.waitReady(stream_);
|
||||||
d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
|
d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
|
||||||
|
|
||||||
@@ -190,7 +190,7 @@ class GpuLU {
|
|||||||
|
|
||||||
const PlainMatrix rhs(B);
|
const PlainMatrix rhs(B);
|
||||||
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
||||||
const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
|
const int64_t ldb = static_cast<int64_t>(rhs.rows());
|
||||||
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
|
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
|
||||||
EIGEN_CUDA_RUNTIME_CHECK(
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
cudaMemcpyAsync(d_x_ptr, rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
|
cudaMemcpyAsync(d_x_ptr, rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
|
||||||
@@ -213,7 +213,7 @@ class GpuLU {
|
|||||||
eigen_assert(d_B.rows() == n_);
|
eigen_assert(d_B.rows() == n_);
|
||||||
d_B.waitReady(stream_);
|
d_B.waitReady(stream_);
|
||||||
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
|
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
|
||||||
const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
|
const int64_t ldb = static_cast<int64_t>(d_B.rows());
|
||||||
return solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
|
return solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
|
||||||
EIGEN_CUDA_RUNTIME_CHECK(
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
cudaMemcpyAsync(d_x_ptr, d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
|
cudaMemcpyAsync(d_x_ptr, d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
|
||||||
@@ -305,7 +305,7 @@ class GpuLU {
|
|||||||
lda_, static_cast<const int64_t*>(d_ipiv_.ptr), dtype, d_x_ptr, ldb,
|
lda_, static_cast<const int64_t*>(d_ipiv_.ptr), dtype, d_x_ptr, ldb,
|
||||||
scratch_info()));
|
scratch_info()));
|
||||||
|
|
||||||
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs), static_cast<Index>(ldb));
|
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs));
|
||||||
result.recordReady(stream_);
|
result.recordReady(stream_);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|||||||
389
Eigen/src/GPU/GpuQR.h
Normal file
389
Eigen/src/GPU/GpuQR.h
Normal file
@@ -0,0 +1,389 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU QR decomposition using cuSOLVER.
|
||||||
|
//
|
||||||
|
// Wraps cusolverDnXgeqrf (factorization), cusolverDnXormqr (apply Q),
|
||||||
|
// cusolverDnXorgqr (form Q), and cublasXtrsm (triangular solve on R).
|
||||||
|
//
|
||||||
|
// The factored matrix (reflectors + R) and tau stay in device memory.
|
||||||
|
// Solve uses ormqr + trsm without forming Q explicitly.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuQR<double> qr(A); // upload A, geqrf
|
||||||
|
// if (qr.info() != Success) { ... }
|
||||||
|
// MatrixXd X = qr.solve(B); // Q^H * B via ormqr, then trsm on R
|
||||||
|
//
|
||||||
|
// Expression syntax:
|
||||||
|
// d_X = d_A.qr().solve(d_B); // temporary, no caching
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_QR_H
|
||||||
|
#define EIGEN_GPU_QR_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuSolverSupport.h"
|
||||||
|
#include "./CuBlasSupport.h"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class GpuQR {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||||
|
|
||||||
|
GpuQR() { init_context(); }
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
explicit GpuQR(const EigenBase<InputType>& A) {
|
||||||
|
init_context();
|
||||||
|
compute(A);
|
||||||
|
}
|
||||||
|
|
||||||
|
~GpuQR() {
|
||||||
|
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||||
|
if (cublas_) (void)cublasDestroy(cublas_);
|
||||||
|
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuQR(const GpuQR&) = delete;
|
||||||
|
GpuQR& operator=(const GpuQR&) = delete;
|
||||||
|
|
||||||
|
GpuQR(GpuQR&& o) noexcept
|
||||||
|
: stream_(o.stream_),
|
||||||
|
handle_(o.handle_),
|
||||||
|
cublas_(o.cublas_),
|
||||||
|
params_(std::move(o.params_)),
|
||||||
|
d_qr_(std::move(o.d_qr_)),
|
||||||
|
d_tau_(std::move(o.d_tau_)),
|
||||||
|
d_scratch_(std::move(o.d_scratch_)),
|
||||||
|
scratch_size_(o.scratch_size_),
|
||||||
|
h_workspace_(std::move(o.h_workspace_)),
|
||||||
|
m_(o.m_),
|
||||||
|
n_(o.n_),
|
||||||
|
lda_(o.lda_),
|
||||||
|
info_(o.info_),
|
||||||
|
info_word_(o.info_word_),
|
||||||
|
info_synced_(o.info_synced_) {
|
||||||
|
o.stream_ = nullptr;
|
||||||
|
o.handle_ = nullptr;
|
||||||
|
o.cublas_ = nullptr;
|
||||||
|
o.scratch_size_ = 0;
|
||||||
|
o.m_ = 0;
|
||||||
|
o.n_ = 0;
|
||||||
|
o.lda_ = 0;
|
||||||
|
o.info_ = InvalidInput;
|
||||||
|
o.info_word_ = 0;
|
||||||
|
o.info_synced_ = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuQR& operator=(GpuQR&& o) noexcept {
|
||||||
|
if (this != &o) {
|
||||||
|
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||||
|
if (cublas_) (void)cublasDestroy(cublas_);
|
||||||
|
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||||
|
stream_ = o.stream_;
|
||||||
|
handle_ = o.handle_;
|
||||||
|
cublas_ = o.cublas_;
|
||||||
|
params_ = std::move(o.params_);
|
||||||
|
d_qr_ = std::move(o.d_qr_);
|
||||||
|
d_tau_ = std::move(o.d_tau_);
|
||||||
|
d_scratch_ = std::move(o.d_scratch_);
|
||||||
|
scratch_size_ = o.scratch_size_;
|
||||||
|
h_workspace_ = std::move(o.h_workspace_);
|
||||||
|
m_ = o.m_;
|
||||||
|
n_ = o.n_;
|
||||||
|
lda_ = o.lda_;
|
||||||
|
info_ = o.info_;
|
||||||
|
info_word_ = o.info_word_;
|
||||||
|
info_synced_ = o.info_synced_;
|
||||||
|
o.stream_ = nullptr;
|
||||||
|
o.handle_ = nullptr;
|
||||||
|
o.cublas_ = nullptr;
|
||||||
|
o.scratch_size_ = 0;
|
||||||
|
o.m_ = 0;
|
||||||
|
o.n_ = 0;
|
||||||
|
o.lda_ = 0;
|
||||||
|
o.info_ = InvalidInput;
|
||||||
|
o.info_word_ = 0;
|
||||||
|
o.info_synced_ = true;
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Factorization -------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
GpuQR& compute(const EigenBase<InputType>& A) {
|
||||||
|
m_ = A.rows();
|
||||||
|
n_ = A.cols();
|
||||||
|
info_ = InvalidInput;
|
||||||
|
info_synced_ = false;
|
||||||
|
|
||||||
|
if (m_ == 0 || n_ == 0) {
|
||||||
|
info_ = Success;
|
||||||
|
info_synced_ = true;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
const PlainMatrix mat(A.derived());
|
||||||
|
lda_ = static_cast<int64_t>(mat.rows());
|
||||||
|
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||||
|
const size_t tau_bytes = static_cast<size_t>((std::min)(m_, n_)) * sizeof(Scalar);
|
||||||
|
|
||||||
|
d_qr_ = internal::DeviceBuffer(mat_bytes);
|
||||||
|
d_tau_ = internal::DeviceBuffer(tau_bytes);
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_qr_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuQR& compute(const DeviceMatrix<Scalar>& d_A) {
|
||||||
|
m_ = d_A.rows();
|
||||||
|
n_ = d_A.cols();
|
||||||
|
info_ = InvalidInput;
|
||||||
|
info_synced_ = false;
|
||||||
|
|
||||||
|
if (m_ == 0 || n_ == 0) {
|
||||||
|
info_ = Success;
|
||||||
|
info_synced_ = true;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
lda_ = static_cast<int64_t>(d_A.rows());
|
||||||
|
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||||
|
const size_t tau_bytes = static_cast<size_t>((std::min)(m_, n_)) * sizeof(Scalar);
|
||||||
|
|
||||||
|
d_A.waitReady(stream_);
|
||||||
|
d_qr_ = internal::DeviceBuffer(mat_bytes);
|
||||||
|
d_tau_ = internal::DeviceBuffer(tau_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_qr_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
|
||||||
|
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve ---------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Solve A * X = B via QR: X = R^{-1} * Q^H * B (least-squares for m >= n).
|
||||||
|
* Uses ormqr (apply Q^H) + trsm (solve R), without forming Q explicitly.
|
||||||
|
* Requires m >= n (overdetermined or square). Underdetermined not supported.
|
||||||
|
*
|
||||||
|
* TODO: Add device-side accessor for the R factor (and Q application) as
|
||||||
|
* DeviceMatrix, so users can chain GPU operations without host round-trips. */
|
||||||
|
template <typename Rhs>
|
||||||
|
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success && "GpuQR::solve called on a failed or uninitialized factorization");
|
||||||
|
eigen_assert(B.rows() == m_);
|
||||||
|
eigen_assert(m_ >= n_ && "GpuQR::solve requires m >= n (use SVD for underdetermined systems)");
|
||||||
|
|
||||||
|
const PlainMatrix rhs(B);
|
||||||
|
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
||||||
|
const int64_t ldb = static_cast<int64_t>(rhs.rows()); // = m_
|
||||||
|
const size_t b_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||||
|
|
||||||
|
// Upload B to device (m × nrhs buffer).
|
||||||
|
internal::DeviceBuffer d_B(b_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.ptr, rhs.data(), b_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
// Apply Q^H to B in-place: d_B becomes m × nrhs, first n rows hold Q^H * B relevant part.
|
||||||
|
apply_QH(d_B.ptr, ldb, nrhs);
|
||||||
|
|
||||||
|
// Solve R * X = (Q^H * B)[0:n,:] via trsm on the first n rows.
|
||||||
|
Scalar alpha(1);
|
||||||
|
EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
|
||||||
|
CUBLAS_DIAG_NON_UNIT, static_cast<int>(n_), static_cast<int>(nrhs), &alpha,
|
||||||
|
static_cast<const Scalar*>(d_qr_.ptr), static_cast<int>(lda_),
|
||||||
|
static_cast<Scalar*>(d_B.ptr), static_cast<int>(ldb)));
|
||||||
|
|
||||||
|
// Download the first n rows of each column (stride = ldb = m, width = n).
|
||||||
|
PlainMatrix X(n_, rhs.cols());
|
||||||
|
if (m_ == n_) {
|
||||||
|
// Square: dense copy, no stride mismatch.
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_B.ptr,
|
||||||
|
static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost, stream_));
|
||||||
|
} else {
|
||||||
|
// Overdetermined: 2D copy to extract first n rows from each column.
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(
|
||||||
|
X.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_B.ptr, static_cast<size_t>(ldb) * sizeof(Scalar),
|
||||||
|
static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
}
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return X;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Solve with device-resident RHS. Returns n × nrhs DeviceMatrix. */
|
||||||
|
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success && "GpuQR::solve called on a failed or uninitialized factorization");
|
||||||
|
eigen_assert(d_B.rows() == m_);
|
||||||
|
eigen_assert(m_ >= n_ && "GpuQR::solve requires m >= n (use SVD for underdetermined systems)");
|
||||||
|
d_B.waitReady(stream_);
|
||||||
|
|
||||||
|
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
|
||||||
|
const int64_t ldb = static_cast<int64_t>(d_B.rows()); // = m_
|
||||||
|
const size_t b_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||||
|
|
||||||
|
// D2D copy B into working buffer (ormqr and trsm are in-place).
|
||||||
|
internal::DeviceBuffer d_work(b_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_work.ptr, d_B.data(), b_bytes, cudaMemcpyDeviceToDevice, stream_));
|
||||||
|
|
||||||
|
apply_QH(d_work.ptr, ldb, nrhs);
|
||||||
|
|
||||||
|
// trsm on the first n rows.
|
||||||
|
Scalar alpha(1);
|
||||||
|
EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
|
||||||
|
CUBLAS_DIAG_NON_UNIT, static_cast<int>(n_), static_cast<int>(nrhs), &alpha,
|
||||||
|
static_cast<const Scalar*>(d_qr_.ptr), static_cast<int>(lda_),
|
||||||
|
static_cast<Scalar*>(d_work.ptr), static_cast<int>(ldb)));
|
||||||
|
|
||||||
|
if (m_ == n_) {
|
||||||
|
// Square: result is the whole buffer, dense.
|
||||||
|
DeviceMatrix<Scalar> result(static_cast<Scalar*>(d_work.ptr), n_, static_cast<Index>(nrhs));
|
||||||
|
d_work.ptr = nullptr; // transfer ownership
|
||||||
|
result.recordReady(stream_);
|
||||||
|
return result;
|
||||||
|
} else {
|
||||||
|
// Overdetermined: copy first n rows of each column into a dense n × nrhs result.
|
||||||
|
DeviceMatrix<Scalar> result(n_, static_cast<Index>(nrhs));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(result.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_work.ptr,
|
||||||
|
static_cast<size_t>(ldb) * sizeof(Scalar),
|
||||||
|
static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs),
|
||||||
|
cudaMemcpyDeviceToDevice, stream_));
|
||||||
|
result.recordReady(stream_);
|
||||||
|
return result;
|
||||||
|
// d_work freed here via RAII — safe because stream is ordered.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Accessors -----------------------------------------------------------
|
||||||
|
|
||||||
|
ComputationInfo info() const {
|
||||||
|
sync_info();
|
||||||
|
return info_;
|
||||||
|
}
|
||||||
|
|
||||||
|
Index rows() const { return m_; }
|
||||||
|
Index cols() const { return n_; }
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cusolverDnHandle_t handle_ = nullptr;
|
||||||
|
cublasHandle_t cublas_ = nullptr;
|
||||||
|
internal::CusolverParams params_;
|
||||||
|
internal::DeviceBuffer d_qr_; // QR factors (reflectors in lower, R in upper)
|
||||||
|
internal::DeviceBuffer d_tau_; // Householder scalars (min(m,n))
|
||||||
|
internal::DeviceBuffer d_scratch_; // workspace + info word
|
||||||
|
size_t scratch_size_ = 0;
|
||||||
|
std::vector<char> h_workspace_;
|
||||||
|
Index m_ = 0;
|
||||||
|
Index n_ = 0;
|
||||||
|
int64_t lda_ = 0;
|
||||||
|
ComputationInfo info_ = InvalidInput;
|
||||||
|
int info_word_ = 0;
|
||||||
|
bool info_synced_ = true;
|
||||||
|
|
||||||
|
void init_context() {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
|
||||||
|
ensure_scratch(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ensure_scratch(size_t workspace_bytes) {
|
||||||
|
constexpr size_t kAlign = 16;
|
||||||
|
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
|
||||||
|
size_t needed = workspace_bytes + sizeof(int);
|
||||||
|
if (needed > scratch_size_) {
|
||||||
|
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
d_scratch_ = internal::DeviceBuffer(needed);
|
||||||
|
scratch_size_ = needed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void* scratch_workspace() const { return d_scratch_.ptr; }
|
||||||
|
int* scratch_info() const {
|
||||||
|
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
|
||||||
|
}
|
||||||
|
|
||||||
|
void sync_info() const {
|
||||||
|
if (!info_synced_) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
const_cast<GpuQR*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
|
||||||
|
const_cast<GpuQR*>(this)->info_synced_ = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void factorize() {
|
||||||
|
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||||
|
|
||||||
|
info_synced_ = false;
|
||||||
|
info_ = InvalidInput;
|
||||||
|
|
||||||
|
size_t dev_ws = 0, host_ws = 0;
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf_bufferSize(handle_, params_.p, static_cast<int64_t>(m_),
|
||||||
|
static_cast<int64_t>(n_), dtype, d_qr_.ptr, lda_, dtype,
|
||||||
|
d_tau_.ptr, dtype, &dev_ws, &host_ws));
|
||||||
|
|
||||||
|
ensure_scratch(dev_ws);
|
||||||
|
h_workspace_.resize(host_ws);
|
||||||
|
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf(handle_, params_.p, static_cast<int64_t>(m_), static_cast<int64_t>(n_), dtype,
|
||||||
|
d_qr_.ptr, lda_, dtype, d_tau_.ptr, dtype, scratch_workspace(), dev_ws,
|
||||||
|
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply Q^H to a device buffer in-place: d_B = Q^H * d_B.
|
||||||
|
// Uses type-specific ormqr (real) or unmqr (complex) wrappers from CuSolverSupport.h.
|
||||||
|
// For real types: Q^H = Q^T, use CUBLAS_OP_T. For complex: use CUBLAS_OP_C.
|
||||||
|
void apply_QH(void* d_B, int64_t ldb, int64_t nrhs) const {
|
||||||
|
const int im = static_cast<int>(m_);
|
||||||
|
const int in = static_cast<int>(nrhs);
|
||||||
|
const int ik = static_cast<int>((std::min)(m_, n_));
|
||||||
|
const int ilda = static_cast<int>(lda_);
|
||||||
|
const int ildb = static_cast<int>(ldb);
|
||||||
|
constexpr cublasOperation_t trans = NumTraits<Scalar>::IsComplex ? CUBLAS_OP_C : CUBLAS_OP_T;
|
||||||
|
|
||||||
|
int lwork = 0;
|
||||||
|
EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr_bufferSize(
|
||||||
|
handle_, CUBLAS_SIDE_LEFT, trans, im, in, ik, static_cast<const Scalar*>(d_qr_.ptr), ilda,
|
||||||
|
static_cast<const Scalar*>(d_tau_.ptr), static_cast<const Scalar*>(d_B), ildb, &lwork));
|
||||||
|
|
||||||
|
internal::DeviceBuffer d_work(static_cast<size_t>(lwork) * sizeof(Scalar));
|
||||||
|
|
||||||
|
EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr(handle_, CUBLAS_SIDE_LEFT, trans, im, in, ik,
|
||||||
|
static_cast<const Scalar*>(d_qr_.ptr), ilda,
|
||||||
|
static_cast<const Scalar*>(d_tau_.ptr), static_cast<Scalar*>(d_B),
|
||||||
|
ildb, static_cast<Scalar*>(d_work.ptr), lwork, scratch_info()));
|
||||||
|
|
||||||
|
// Sync to ensure workspace can be freed safely, and check ormqr info.
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
int ormqr_info = 0;
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(&ormqr_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost));
|
||||||
|
eigen_assert(ormqr_info == 0 && "cusolverDnXormqr reported an error");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_QR_H
|
||||||
490
Eigen/src/GPU/GpuSVD.h
Normal file
490
Eigen/src/GPU/GpuSVD.h
Normal file
@@ -0,0 +1,490 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU SVD decomposition using cuSOLVER (divide-and-conquer).
|
||||||
|
//
|
||||||
|
// Wraps cusolverDnXgesvd. Stores U, S, VT on device. Solve uses
|
||||||
|
// cuBLAS GEMM: X = VT^H * diag(D) * U^H * B.
|
||||||
|
//
|
||||||
|
// cuSOLVER returns VT (not V). We store and expose VT directly.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuSVD<double> svd(A, ComputeThinU | ComputeThinV);
|
||||||
|
// VectorXd S = svd.singularValues();
|
||||||
|
// MatrixXd U = svd.matrixU(); // m×k or m×m
|
||||||
|
// MatrixXd VT = svd.matrixVT(); // k×n or n×n (this is V^T)
|
||||||
|
// MatrixXd X = svd.solve(B); // pseudoinverse
|
||||||
|
// MatrixXd X = svd.solve(B, k); // truncated (top k triplets)
|
||||||
|
// MatrixXd X = svd.solve(B, 0.1); // Tikhonov regularized
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_SVD_H
|
||||||
|
#define EIGEN_GPU_SVD_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuSolverSupport.h"
|
||||||
|
#include "./CuBlasSupport.h"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class GpuSVD {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||||
|
using RealVector = Matrix<RealScalar, Dynamic, 1>;
|
||||||
|
|
||||||
|
GpuSVD() { init_context(); }
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
explicit GpuSVD(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
|
||||||
|
init_context();
|
||||||
|
compute(A, options);
|
||||||
|
}
|
||||||
|
|
||||||
|
~GpuSVD() {
|
||||||
|
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||||
|
if (cublas_) (void)cublasDestroy(cublas_);
|
||||||
|
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuSVD(const GpuSVD&) = delete;
|
||||||
|
GpuSVD& operator=(const GpuSVD&) = delete;
|
||||||
|
// Move constructors omitted for brevity — follow GpuQR pattern.
|
||||||
|
|
||||||
|
// ---- Factorization -------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
GpuSVD& compute(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
|
||||||
|
options_ = options;
|
||||||
|
m_ = A.rows();
|
||||||
|
n_ = A.cols();
|
||||||
|
info_ = InvalidInput;
|
||||||
|
info_synced_ = false;
|
||||||
|
|
||||||
|
if (m_ == 0 || n_ == 0) {
|
||||||
|
info_ = Success;
|
||||||
|
info_synced_ = true;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// cuSOLVER gesvd requires m >= n. For wide matrices, transpose internally.
|
||||||
|
transposed_ = (m_ < n_);
|
||||||
|
const PlainMatrix mat = transposed_ ? PlainMatrix(A.derived().adjoint()) : PlainMatrix(A.derived());
|
||||||
|
if (transposed_) std::swap(m_, n_);
|
||||||
|
|
||||||
|
lda_ = static_cast<int64_t>(mat.rows());
|
||||||
|
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||||
|
|
||||||
|
// Copy (possibly transposed) A to device (gesvd overwrites it).
|
||||||
|
d_A_ = internal::DeviceBuffer(mat_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuSVD& compute(const DeviceMatrix<Scalar>& d_A, unsigned int options = ComputeThinU | ComputeThinV) {
|
||||||
|
options_ = options;
|
||||||
|
m_ = d_A.rows();
|
||||||
|
n_ = d_A.cols();
|
||||||
|
info_ = InvalidInput;
|
||||||
|
info_synced_ = false;
|
||||||
|
|
||||||
|
if (m_ == 0 || n_ == 0) {
|
||||||
|
info_ = Success;
|
||||||
|
info_synced_ = true;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
transposed_ = (m_ < n_);
|
||||||
|
d_A.waitReady(stream_);
|
||||||
|
|
||||||
|
if (transposed_) {
|
||||||
|
// Transpose on device via cuBLAS geam: d_A_ = A^H.
|
||||||
|
std::swap(m_, n_);
|
||||||
|
lda_ = m_;
|
||||||
|
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||||
|
d_A_ = internal::DeviceBuffer(mat_bytes);
|
||||||
|
Scalar alpha_one(1), beta_zero(0);
|
||||||
|
// geam: C(m×n) = alpha * op(A) + beta * op(B). Use B = nullptr trick: beta=0.
|
||||||
|
// A is the original d_A (n_orig × m_orig = n × m after swap), transposed → m × n.
|
||||||
|
EIGEN_CUBLAS_CHECK(internal::cublasXgeam(
|
||||||
|
cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(m_), static_cast<int>(n_), &alpha_one, d_A.data(),
|
||||||
|
static_cast<int>(d_A.rows()), &beta_zero, static_cast<const Scalar*>(nullptr), static_cast<int>(m_),
|
||||||
|
static_cast<Scalar*>(d_A_.ptr), static_cast<int>(m_)));
|
||||||
|
} else {
|
||||||
|
lda_ = static_cast<int64_t>(d_A.rows());
|
||||||
|
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||||
|
d_A_ = internal::DeviceBuffer(mat_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Accessors -----------------------------------------------------------
|
||||||
|
|
||||||
|
ComputationInfo info() const {
|
||||||
|
sync_info();
|
||||||
|
return info_;
|
||||||
|
}
|
||||||
|
|
||||||
|
Index rows() const { return transposed_ ? n_ : m_; }
|
||||||
|
Index cols() const { return transposed_ ? m_ : n_; }
|
||||||
|
|
||||||
|
// TODO: Add device-side accessors (deviceU(), deviceVT(), deviceSingularValues())
|
||||||
|
// returning DeviceMatrix views of the internal buffers, so users can chain
|
||||||
|
// GPU operations without round-tripping through host memory.
|
||||||
|
|
||||||
|
/** Singular values (always available). Downloads from device on each call. */
|
||||||
|
RealVector singularValues() const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success);
|
||||||
|
const Index k = (std::min)(m_, n_);
|
||||||
|
RealVector S(k);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpy(S.data(), d_S_.ptr, static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
|
||||||
|
return S;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Left singular vectors U. Returns m_orig × k or m_orig × m_orig.
|
||||||
|
* For transposed case (m_orig < n_orig), U comes from cuSOLVER's VT. */
|
||||||
|
PlainMatrix matrixU() const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success);
|
||||||
|
eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "matrixU() requires ComputeThinU or ComputeFullU");
|
||||||
|
const Index m_orig = transposed_ ? n_ : m_;
|
||||||
|
const Index n_orig = transposed_ ? m_ : n_;
|
||||||
|
const Index k = (std::min)(m_orig, n_orig);
|
||||||
|
if (!transposed_) {
|
||||||
|
const Index ucols = (options_ & ComputeFullU) ? m_ : k;
|
||||||
|
PlainMatrix U(m_, ucols);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U.data(), d_U_.ptr,
|
||||||
|
static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost));
|
||||||
|
return U;
|
||||||
|
} else {
|
||||||
|
// Transposed: U_orig = VT_stored^H. VT_stored is vtrows × n_ (= vtrows × m_orig).
|
||||||
|
const Index vtrows = (options_ & ComputeFullU) ? m_orig : k; // Note: FullU maps to FullV of A^H
|
||||||
|
PlainMatrix VT_stored(vtrows, n_);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT_stored.data(), d_VT_.ptr,
|
||||||
|
static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost));
|
||||||
|
return VT_stored.adjoint(); // m_orig × vtrows
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Right singular vectors transposed V^T. Returns k × n_orig or n_orig × n_orig.
|
||||||
|
* For transposed case, VT comes from cuSOLVER's U. */
|
||||||
|
PlainMatrix matrixVT() const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success);
|
||||||
|
eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "matrixVT() requires ComputeThinV or ComputeFullV");
|
||||||
|
const Index m_orig = transposed_ ? n_ : m_;
|
||||||
|
const Index n_orig = transposed_ ? m_ : n_;
|
||||||
|
const Index k = (std::min)(m_orig, n_orig);
|
||||||
|
if (!transposed_) {
|
||||||
|
const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
|
||||||
|
PlainMatrix VT(vtrows, n_);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT.data(), d_VT_.ptr,
|
||||||
|
static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost));
|
||||||
|
return VT;
|
||||||
|
} else {
|
||||||
|
// Transposed: VT_orig = U_stored^H. U_stored is m_ × ucols (= n_orig × ucols).
|
||||||
|
const Index ucols = (options_ & ComputeFullV) ? n_orig : k; // FullV maps to FullU of A^H
|
||||||
|
PlainMatrix U_stored(m_, ucols);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U_stored.data(), d_U_.ptr,
|
||||||
|
static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost));
|
||||||
|
return U_stored.adjoint(); // ucols × n_orig
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Number of singular values above threshold. */
|
||||||
|
Index rank(RealScalar threshold = RealScalar(-1)) const {
|
||||||
|
RealVector S = singularValues();
|
||||||
|
if (S.size() == 0) return 0;
|
||||||
|
if (threshold < 0) {
|
||||||
|
threshold = (std::max)(m_, n_) * S(0) * NumTraits<RealScalar>::epsilon();
|
||||||
|
}
|
||||||
|
Index r = 0;
|
||||||
|
for (Index i = 0; i < S.size(); ++i) {
|
||||||
|
if (S(i) > threshold) ++r;
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve ---------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Pseudoinverse solve: X = V * diag(1/S) * U^H * B. */
|
||||||
|
template <typename Rhs>
|
||||||
|
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
|
||||||
|
return solve_impl(B, (std::min)(m_, n_), RealScalar(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Truncated solve: use only top trunc singular triplets. */
|
||||||
|
template <typename Rhs>
|
||||||
|
PlainMatrix solve(const MatrixBase<Rhs>& B, Index trunc) const {
|
||||||
|
eigen_assert(trunc > 0 && trunc <= (std::min)(m_, n_));
|
||||||
|
return solve_impl(B, trunc, RealScalar(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Tikhonov-regularized solve: D_ii = S_i / (S_i^2 + lambda^2). */
|
||||||
|
template <typename Rhs>
|
||||||
|
PlainMatrix solve(const MatrixBase<Rhs>& B, RealScalar lambda) const {
|
||||||
|
eigen_assert(lambda > 0);
|
||||||
|
return solve_impl(B, (std::min)(m_, n_), lambda);
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cusolverDnHandle_t handle_ = nullptr;
|
||||||
|
cublasHandle_t cublas_ = nullptr;
|
||||||
|
internal::CusolverParams params_;
|
||||||
|
internal::DeviceBuffer d_A_; // working copy of A (overwritten by gesvd)
|
||||||
|
internal::DeviceBuffer d_U_; // left singular vectors
|
||||||
|
internal::DeviceBuffer d_S_; // singular values (RealScalar)
|
||||||
|
internal::DeviceBuffer d_VT_; // right singular vectors transposed
|
||||||
|
internal::DeviceBuffer d_scratch_; // workspace + info
|
||||||
|
size_t scratch_size_ = 0;
|
||||||
|
std::vector<char> h_workspace_;
|
||||||
|
unsigned int options_ = 0;
|
||||||
|
Index m_ = 0;
|
||||||
|
Index n_ = 0;
|
||||||
|
int64_t lda_ = 0;
|
||||||
|
bool transposed_ = false; // true if m < n (we compute SVD of A^T internally)
|
||||||
|
ComputationInfo info_ = InvalidInput;
|
||||||
|
int info_word_ = 0;
|
||||||
|
bool info_synced_ = true;
|
||||||
|
|
||||||
|
void init_context() {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
|
||||||
|
ensure_scratch(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ensure_scratch(size_t workspace_bytes) {
|
||||||
|
constexpr size_t kAlign = 16;
|
||||||
|
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
|
||||||
|
size_t needed = workspace_bytes + sizeof(int);
|
||||||
|
if (needed > scratch_size_) {
|
||||||
|
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
d_scratch_ = internal::DeviceBuffer(needed);
|
||||||
|
scratch_size_ = needed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void* scratch_workspace() const { return d_scratch_.ptr; }
|
||||||
|
int* scratch_info() const {
|
||||||
|
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
|
||||||
|
}
|
||||||
|
|
||||||
|
void sync_info() const {
|
||||||
|
if (!info_synced_) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
const_cast<GpuSVD*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
|
||||||
|
const_cast<GpuSVD*>(this)->info_synced_ = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Swap U↔V flags for the transposed case.
|
||||||
|
static unsigned int swap_uv_options(unsigned int opts) {
|
||||||
|
unsigned int result = 0;
|
||||||
|
if (opts & ComputeThinU) result |= ComputeThinV;
|
||||||
|
if (opts & ComputeFullU) result |= ComputeFullV;
|
||||||
|
if (opts & ComputeThinV) result |= ComputeThinU;
|
||||||
|
if (opts & ComputeFullV) result |= ComputeFullU;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static signed char jobu(unsigned int opts) {
|
||||||
|
if (opts & ComputeFullU) return 'A';
|
||||||
|
if (opts & ComputeThinU) return 'S';
|
||||||
|
return 'N';
|
||||||
|
}
|
||||||
|
|
||||||
|
static signed char jobvt(unsigned int opts) {
|
||||||
|
if (opts & ComputeFullV) return 'A';
|
||||||
|
if (opts & ComputeThinV) return 'S';
|
||||||
|
return 'N';
|
||||||
|
}
|
||||||
|
|
||||||
|
void factorize() {
|
||||||
|
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||||
|
constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
|
||||||
|
const Index k = (std::min)(m_, n_);
|
||||||
|
|
||||||
|
info_synced_ = false;
|
||||||
|
info_ = InvalidInput;
|
||||||
|
|
||||||
|
// Allocate output buffers. When transposed, swap U/V roles for cuSOLVER.
|
||||||
|
d_S_ = internal::DeviceBuffer(static_cast<size_t>(k) * sizeof(RealScalar));
|
||||||
|
|
||||||
|
// Internal options: for transposed case, what user wants as U we compute as VT of A^H.
|
||||||
|
const unsigned int int_opts = transposed_ ? swap_uv_options(options_) : options_;
|
||||||
|
|
||||||
|
const Index ucols = (int_opts & ComputeFullU) ? m_ : ((int_opts & ComputeThinU) ? k : 0);
|
||||||
|
const Index vtrows = (int_opts & ComputeFullV) ? n_ : ((int_opts & ComputeThinV) ? k : 0);
|
||||||
|
const int64_t ldu = m_;
|
||||||
|
const int64_t ldvt = vtrows > 0 ? vtrows : 1;
|
||||||
|
|
||||||
|
if (ucols > 0) d_U_ = internal::DeviceBuffer(static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar));
|
||||||
|
if (vtrows > 0)
|
||||||
|
d_VT_ = internal::DeviceBuffer(static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar));
|
||||||
|
|
||||||
|
// computeType must match the matrix data type (dtype), not the singular value type (rtype).
|
||||||
|
eigen_assert(m_ >= n_ && "Internal error: m_ < n_ should have been handled by transpose in compute()");
|
||||||
|
size_t dev_ws = 0, host_ws = 0;
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd_bufferSize(
|
||||||
|
handle_, params_.p, jobu(int_opts), jobvt(int_opts), static_cast<int64_t>(m_), static_cast<int64_t>(n_), dtype,
|
||||||
|
d_A_.ptr, lda_, rtype, d_S_.ptr, dtype, ucols > 0 ? d_U_.ptr : nullptr, ldu, dtype,
|
||||||
|
vtrows > 0 ? d_VT_.ptr : nullptr, ldvt, dtype, &dev_ws, &host_ws));
|
||||||
|
|
||||||
|
ensure_scratch(dev_ws);
|
||||||
|
h_workspace_.resize(host_ws);
|
||||||
|
|
||||||
|
// Compute SVD.
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd(handle_, params_.p, jobu(int_opts), jobvt(int_opts), static_cast<int64_t>(m_),
|
||||||
|
static_cast<int64_t>(n_), dtype, d_A_.ptr, lda_, rtype, d_S_.ptr, dtype,
|
||||||
|
ucols > 0 ? d_U_.ptr : nullptr, ldu, dtype, vtrows > 0 ? d_VT_.ptr : nullptr,
|
||||||
|
ldvt, dtype, scratch_workspace(), dev_ws,
|
||||||
|
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Internal solve: X = V * diag(D) * U^H * B, using top `trunc` triplets.
|
||||||
|
// D_ii = 1/S_i (if lambda==0) or S_i/(S_i^2+lambda^2).
|
||||||
|
//
|
||||||
|
// For non-transposed: stored U, VT. X = VT^H * D * U^H * B.
|
||||||
|
// For transposed (SVD of A^H): stored U', VT'. X = U' * D * VT' * B.
|
||||||
|
template <typename Rhs>
|
||||||
|
PlainMatrix solve_impl(const MatrixBase<Rhs>& B, Index trunc, RealScalar lambda) const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success && "GpuSVD::solve called on a failed or uninitialized decomposition");
|
||||||
|
eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "solve requires U");
|
||||||
|
eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "solve requires V");
|
||||||
|
|
||||||
|
const Index m_orig = transposed_ ? n_ : m_;
|
||||||
|
const Index n_orig = transposed_ ? m_ : n_;
|
||||||
|
eigen_assert(B.rows() == m_orig);
|
||||||
|
|
||||||
|
const Index k = (std::min)(m_, n_); // = min(m_orig, n_orig)
|
||||||
|
const Index kk = (std::min)(trunc, k);
|
||||||
|
const Index nrhs = B.cols();
|
||||||
|
|
||||||
|
// Download S to host to build the diagonal scaling.
|
||||||
|
RealVector S(k);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpy(S.data(), d_S_.ptr, static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
|
||||||
|
// Upload B (m_orig × nrhs).
|
||||||
|
const PlainMatrix rhs(B);
|
||||||
|
internal::DeviceBuffer d_B(static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.ptr, rhs.data(),
|
||||||
|
static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||||
|
cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
// Step 1: tmp = U_orig^H * B (kk × nrhs).
|
||||||
|
// Non-transposed: U_stored is m_×ucols, U_orig = U_stored. Use U_stored^H * B.
|
||||||
|
// Transposed: U_orig = VT_stored^H, so U_orig^H = VT_stored. Use VT_stored * B (no transpose!).
|
||||||
|
internal::DeviceBuffer d_tmp(static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar));
|
||||||
|
{
|
||||||
|
Scalar alpha_one(1), beta_zero(0);
|
||||||
|
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
|
||||||
|
constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;
|
||||||
|
|
||||||
|
if (!transposed_) {
|
||||||
|
// U_stored^H * B: (m_×kk)^H × (m_×nrhs) → kk×nrhs.
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs),
|
||||||
|
static_cast<int>(m_), &alpha_one, d_U_.ptr, dtype, static_cast<int>(m_),
|
||||||
|
d_B.ptr, dtype, static_cast<int>(m_orig), &beta_zero, d_tmp.ptr, dtype,
|
||||||
|
static_cast<int>(kk), compute, internal::cuda_gemm_algo()));
|
||||||
|
} else {
|
||||||
|
// VT_stored * B: VT_stored is vtrows×n_ = kk×m_orig (thin), NoTrans.
|
||||||
|
// vtrows×m_orig times m_orig×nrhs → vtrows×nrhs. Use first kk rows.
|
||||||
|
const Index vtrows_stored = (swap_uv_options(options_) & ComputeFullV) ? n_ : k;
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasGemmEx(
|
||||||
|
cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs), static_cast<int>(m_orig),
|
||||||
|
&alpha_one, d_VT_.ptr, dtype, static_cast<int>(vtrows_stored), d_B.ptr, dtype, static_cast<int>(m_orig),
|
||||||
|
&beta_zero, d_tmp.ptr, dtype, static_cast<int>(kk), compute, internal::cuda_gemm_algo()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: Scale row i of tmp by D_ii.
|
||||||
|
// Download tmp to host, scale, re-upload. (Simple and correct; a device kernel would be faster.)
|
||||||
|
{
|
||||||
|
PlainMatrix tmp(kk, nrhs);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(tmp.data(), d_tmp.ptr,
|
||||||
|
static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
|
||||||
|
for (Index i = 0; i < kk; ++i) {
|
||||||
|
RealScalar si = S(i);
|
||||||
|
RealScalar di = (lambda == RealScalar(0)) ? (si > 0 ? RealScalar(1) / si : RealScalar(0))
|
||||||
|
: si / (si * si + lambda * lambda);
|
||||||
|
tmp.row(i) *= Scalar(di);
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_tmp.ptr, tmp.data(),
|
||||||
|
static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||||
|
cudaMemcpyHostToDevice, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 3: X = V_orig * tmp (n_orig × nrhs).
|
||||||
|
// Non-transposed: V_orig = VT_stored^H. VT_stored[:kk,:]^H * tmp → n_orig × nrhs.
|
||||||
|
// Transposed: V_orig = U_stored[:,:kk]. U_stored * tmp → n_orig × nrhs (NoTrans).
|
||||||
|
PlainMatrix X(n_orig, nrhs);
|
||||||
|
{
|
||||||
|
internal::DeviceBuffer d_X(static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
|
||||||
|
Scalar alpha_one(1), beta_zero(0);
|
||||||
|
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
|
||||||
|
constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;
|
||||||
|
|
||||||
|
if (!transposed_) {
|
||||||
|
const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(n_orig),
|
||||||
|
static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_VT_.ptr, dtype,
|
||||||
|
static_cast<int>(vtrows), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero,
|
||||||
|
d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo()));
|
||||||
|
} else {
|
||||||
|
// U_stored is m_×ucols. V_orig = U_stored[:,:kk]. NoTrans × tmp.
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(n_orig),
|
||||||
|
static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_U_.ptr, dtype,
|
||||||
|
static_cast<int>(m_), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero,
|
||||||
|
d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo()));
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_X.ptr,
|
||||||
|
static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
return X;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_SVD_H
|
||||||
321
Eigen/src/GPU/GpuSparseContext.h
Normal file
321
Eigen/src/GPU/GpuSparseContext.h
Normal file
@@ -0,0 +1,321 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix
|
||||||
|
// multiply (SpMM) via cuSPARSE.
|
||||||
|
//
|
||||||
|
// GpuSparseContext manages a cuSPARSE handle and device buffers. It accepts
|
||||||
|
// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the
|
||||||
|
// GPU. RowMajor input is implicitly converted to ColMajor.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuSparseContext<double> ctx;
|
||||||
|
// VectorXd y = ctx.multiply(A, x); // y = A * x
|
||||||
|
// ctx.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y
|
||||||
|
// VectorXd z = ctx.multiplyT(A, x); // z = A^T * x
|
||||||
|
// MatrixXd Y = ctx.multiplyMat(A, X); // Y = A * X (multiple RHS)
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_SPARSE_CONTEXT_H
|
||||||
|
#define EIGEN_GPU_SPARSE_CONTEXT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuSparseSupport.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class GpuSparseContext {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
using StorageIndex = int;
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
|
||||||
|
using DenseVector = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||||
|
|
||||||
|
GpuSparseContext() {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_));
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
~GpuSparseContext() {
|
||||||
|
destroy_descriptors();
|
||||||
|
if (handle_) (void)cusparseDestroy(handle_);
|
||||||
|
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuSparseContext(const GpuSparseContext&) = delete;
|
||||||
|
GpuSparseContext& operator=(const GpuSparseContext&) = delete;
|
||||||
|
|
||||||
|
// ---- SpMV: y = A * x -----------------------------------------------------
|
||||||
|
|
||||||
|
/** Compute y = A * x. Returns y as a new dense vector. */
|
||||||
|
template <typename InputType, typename Rhs>
|
||||||
|
DenseVector multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
|
||||||
|
const SpMat mat(A.derived());
|
||||||
|
DenseVector y(mat.rows());
|
||||||
|
y.setZero();
|
||||||
|
multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
|
||||||
|
return y;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Compute y = alpha * op(A) * x + beta * y (in-place). */
|
||||||
|
template <typename InputType, typename Rhs, typename Dest>
|
||||||
|
void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y,
|
||||||
|
Scalar alpha = Scalar(1), Scalar beta = Scalar(0),
|
||||||
|
cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
|
||||||
|
const SpMat mat(A.derived());
|
||||||
|
multiply_impl(mat, x.derived(), y.derived(), alpha, beta, op);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMV transpose: y = A^T * x -----------------------------------------
|
||||||
|
|
||||||
|
/** Compute y = A^T * x. Returns y as a new dense vector. */
|
||||||
|
template <typename InputType, typename Rhs>
|
||||||
|
DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
|
||||||
|
const SpMat mat(A.derived());
|
||||||
|
DenseVector y(mat.cols());
|
||||||
|
y.setZero();
|
||||||
|
multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE);
|
||||||
|
return y;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMM: Y = A * X (multiple RHS) --------------------------------------
|
||||||
|
|
||||||
|
/** Compute Y = A * X where X is a dense matrix (multiple RHS). Returns Y. */
|
||||||
|
template <typename InputType, typename Rhs>
|
||||||
|
DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X) {
|
||||||
|
const SpMat mat(A.derived());
|
||||||
|
const DenseMatrix rhs(X.derived());
|
||||||
|
eigen_assert(mat.cols() == rhs.rows());
|
||||||
|
|
||||||
|
const Index m = mat.rows();
|
||||||
|
const Index n = rhs.cols();
|
||||||
|
if (m == 0 || n == 0 || mat.nonZeros() == 0) return DenseMatrix::Zero(m, n);
|
||||||
|
|
||||||
|
DenseMatrix Y = DenseMatrix::Zero(m, n);
|
||||||
|
spmm_impl(mat, rhs, Y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
|
||||||
|
return Y;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Accessors ------------------------------------------------------------
|
||||||
|
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cusparseHandle_t handle_ = nullptr;
|
||||||
|
|
||||||
|
// Cached device buffers (grow-only).
|
||||||
|
internal::DeviceBuffer d_outerPtr_;
|
||||||
|
internal::DeviceBuffer d_innerIdx_;
|
||||||
|
internal::DeviceBuffer d_values_;
|
||||||
|
internal::DeviceBuffer d_x_;
|
||||||
|
internal::DeviceBuffer d_y_;
|
||||||
|
internal::DeviceBuffer d_workspace_;
|
||||||
|
size_t d_outerPtr_size_ = 0;
|
||||||
|
size_t d_innerIdx_size_ = 0;
|
||||||
|
size_t d_values_size_ = 0;
|
||||||
|
size_t d_x_size_ = 0;
|
||||||
|
size_t d_y_size_ = 0;
|
||||||
|
size_t d_workspace_size_ = 0;
|
||||||
|
|
||||||
|
// Cached cuSPARSE descriptors.
|
||||||
|
cusparseSpMatDescr_t spmat_desc_ = nullptr;
|
||||||
|
Index cached_rows_ = -1;
|
||||||
|
Index cached_cols_ = -1;
|
||||||
|
Index cached_nnz_ = -1;
|
||||||
|
|
||||||
|
// ---- SpMV implementation --------------------------------------------------
|
||||||
|
|
||||||
|
template <typename RhsDerived, typename DestDerived>
|
||||||
|
void multiply_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta,
|
||||||
|
cusparseOperation_t op) {
|
||||||
|
eigen_assert(A.isCompressed());
|
||||||
|
|
||||||
|
const Index m = A.rows();
|
||||||
|
const Index n = A.cols();
|
||||||
|
const Index nnz = A.nonZeros();
|
||||||
|
const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
|
||||||
|
const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
|
||||||
|
|
||||||
|
eigen_assert(x.size() == x_size);
|
||||||
|
eigen_assert(y.size() == y_size);
|
||||||
|
|
||||||
|
if (m == 0 || n == 0 || nnz == 0) {
|
||||||
|
if (beta == Scalar(0))
|
||||||
|
y.setZero();
|
||||||
|
else
|
||||||
|
y *= beta;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Upload sparse matrix to device.
|
||||||
|
upload_sparse(A);
|
||||||
|
|
||||||
|
// Upload x to device.
|
||||||
|
ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar));
|
||||||
|
const DenseVector x_tmp(x);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_x_.ptr, x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
// Upload y to device (for beta != 0).
|
||||||
|
ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar));
|
||||||
|
if (beta != Scalar(0)) {
|
||||||
|
const DenseVector y_tmp(y);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_y_.ptr, y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create dense vector descriptors.
|
||||||
|
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
|
||||||
|
cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr;
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_.ptr, dtype));
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_.ptr, dtype));
|
||||||
|
|
||||||
|
// Query workspace size.
|
||||||
|
size_t ws_size = 0;
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
|
||||||
|
CUSPARSE_SPMV_ALG_DEFAULT, &ws_size));
|
||||||
|
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
|
||||||
|
|
||||||
|
// Execute SpMV.
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
|
||||||
|
CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.ptr));
|
||||||
|
|
||||||
|
// Download result.
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(y.data(), d_y_.ptr, y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
|
||||||
|
(void)cusparseDestroyDnVec(x_desc);
|
||||||
|
(void)cusparseDestroyDnVec(y_desc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMM implementation --------------------------------------------------
|
||||||
|
|
||||||
|
void spmm_impl(const SpMat& A, const DenseMatrix& X, DenseMatrix& Y, Scalar alpha, Scalar beta,
|
||||||
|
cusparseOperation_t op) {
|
||||||
|
eigen_assert(A.isCompressed());
|
||||||
|
|
||||||
|
const Index m = A.rows();
|
||||||
|
const Index n = X.cols();
|
||||||
|
const Index k = A.cols();
|
||||||
|
const Index nnz = A.nonZeros();
|
||||||
|
|
||||||
|
if (m == 0 || n == 0 || k == 0 || nnz == 0) {
|
||||||
|
if (beta == Scalar(0))
|
||||||
|
Y.setZero();
|
||||||
|
else
|
||||||
|
Y *= beta;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
upload_sparse(A);
|
||||||
|
|
||||||
|
// Upload X to device.
|
||||||
|
const size_t x_bytes = static_cast<size_t>(k) * static_cast<size_t>(n) * sizeof(Scalar);
|
||||||
|
const size_t y_bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(Scalar);
|
||||||
|
ensure_buffer(d_x_, d_x_size_, x_bytes);
|
||||||
|
ensure_buffer(d_y_, d_y_size_, y_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_x_.ptr, X.data(), x_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
if (beta != Scalar(0)) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.ptr, Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create dense matrix descriptors.
|
||||||
|
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
|
||||||
|
cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr;
|
||||||
|
// Eigen is column-major, so ld = rows.
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k, n, k, d_x_.ptr, dtype, CUSPARSE_ORDER_COL));
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m, n, m, d_y_.ptr, dtype, CUSPARSE_ORDER_COL));
|
||||||
|
|
||||||
|
// Query workspace.
|
||||||
|
size_t ws_size = 0;
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_,
|
||||||
|
x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size));
|
||||||
|
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
|
||||||
|
|
||||||
|
// Execute SpMM.
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta,
|
||||||
|
y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.ptr));
|
||||||
|
|
||||||
|
// Download result.
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.ptr, y_bytes, cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
|
||||||
|
(void)cusparseDestroyDnMat(x_desc);
|
||||||
|
(void)cusparseDestroyDnMat(y_desc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Helpers --------------------------------------------------------------
|
||||||
|
|
||||||
|
void upload_sparse(const SpMat& A) {
|
||||||
|
const Index m = A.rows();
|
||||||
|
const Index n = A.cols();
|
||||||
|
const Index nnz = A.nonZeros();
|
||||||
|
|
||||||
|
const size_t outer_bytes = static_cast<size_t>(n + 1) * sizeof(StorageIndex);
|
||||||
|
const size_t inner_bytes = static_cast<size_t>(nnz) * sizeof(StorageIndex);
|
||||||
|
const size_t val_bytes = static_cast<size_t>(nnz) * sizeof(Scalar);
|
||||||
|
|
||||||
|
ensure_buffer(d_outerPtr_, d_outerPtr_size_, outer_bytes);
|
||||||
|
ensure_buffer(d_innerIdx_, d_innerIdx_size_, inner_bytes);
|
||||||
|
ensure_buffer(d_values_, d_values_size_, val_bytes);
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_outerPtr_.ptr, A.outerIndexPtr(), outer_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_innerIdx_.ptr, A.innerIndexPtr(), inner_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, A.valuePtr(), val_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
// Recreate descriptor if shape changed.
|
||||||
|
if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) {
|
||||||
|
destroy_descriptors();
|
||||||
|
|
||||||
|
constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
|
||||||
|
constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value;
|
||||||
|
|
||||||
|
// ColMajor → CSC. outerIndexPtr = col offsets, innerIndexPtr = row indices.
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr,
|
||||||
|
idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO, val_type));
|
||||||
|
cached_rows_ = m;
|
||||||
|
cached_cols_ = n;
|
||||||
|
cached_nnz_ = nnz;
|
||||||
|
} else {
|
||||||
|
// Same shape — just update pointers.
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void destroy_descriptors() {
|
||||||
|
if (spmat_desc_) {
|
||||||
|
(void)cusparseDestroySpMat(spmat_desc_);
|
||||||
|
spmat_desc_ = nullptr;
|
||||||
|
}
|
||||||
|
cached_rows_ = -1;
|
||||||
|
cached_cols_ = -1;
|
||||||
|
cached_nnz_ = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) {
|
||||||
|
if (needed > current_size) {
|
||||||
|
if (buf.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
buf = internal::DeviceBuffer(needed);
|
||||||
|
current_size = needed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_SPARSE_CONTEXT_H
|
||||||
62
Eigen/src/GPU/GpuSparseLDLT.h
Normal file
62
Eigen/src/GPU/GpuSparseLDLT.h
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU sparse LDL^T / LDL^H factorization via cuDSS.
|
||||||
|
//
|
||||||
|
// For symmetric indefinite (or Hermitian indefinite) sparse matrices.
|
||||||
|
// Same three-phase workflow as GpuSparseLLT.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuSparseLDLT<double> ldlt(A); // analyze + factorize
|
||||||
|
// VectorXd x = ldlt.solve(b); // solve
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_SPARSE_LDLT_H
|
||||||
|
#define EIGEN_GPU_SPARSE_LDLT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSparseSolverBase.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
/** GPU sparse LDL^T factorization (symmetric indefinite / Hermitian indefinite).
|
||||||
|
*
|
||||||
|
* Wraps cuDSS with CUDSS_MTYPE_SYMMETRIC (real) or CUDSS_MTYPE_HERMITIAN (complex).
|
||||||
|
* Uses pivoting for numerical stability.
|
||||||
|
*
|
||||||
|
* \tparam Scalar_ float, double, complex<float>, or complex<double>
|
||||||
|
* \tparam UpLo_ Lower (default) or Upper — which triangle of A is stored
|
||||||
|
*/
|
||||||
|
template <typename Scalar_, int UpLo_ = Lower>
|
||||||
|
class GpuSparseLDLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT<Scalar_, UpLo_>> {
|
||||||
|
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT>;
|
||||||
|
friend Base;
|
||||||
|
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
enum { UpLo = UpLo_ };
|
||||||
|
|
||||||
|
GpuSparseLDLT() = default;
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
explicit GpuSparseLDLT(const SparseMatrixBase<InputType>& A) {
|
||||||
|
this->compute(A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr bool needs_csr_conversion() { return false; }
|
||||||
|
static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_symmetric_type<Scalar>::value; }
|
||||||
|
static constexpr cudssMatrixViewType_t cudss_matrix_view() {
|
||||||
|
return internal::cudss_view_type<UpLo, ColMajor>::value;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_SPARSE_LDLT_H
|
||||||
62
Eigen/src/GPU/GpuSparseLLT.h
Normal file
62
Eigen/src/GPU/GpuSparseLLT.h
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU sparse Cholesky (LL^T / LL^H) via cuDSS.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuSparseLLT<double> llt(A); // analyze + factorize
|
||||||
|
// VectorXd x = llt.solve(b); // solve
|
||||||
|
// llt.analyzePattern(A); // or separate phases
|
||||||
|
// llt.factorize(A_new); // reuse symbolic analysis
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_SPARSE_LLT_H
|
||||||
|
#define EIGEN_GPU_SPARSE_LLT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSparseSolverBase.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
/** GPU sparse Cholesky factorization (LL^T for real, LL^H for complex).
|
||||||
|
*
|
||||||
|
* Wraps cuDSS with CUDSS_MTYPE_SPD (real) or CUDSS_MTYPE_HPD (complex).
|
||||||
|
* Accepts ColMajor SparseMatrix (CSC), reinterpreted as CSR with swapped
|
||||||
|
* triangle view for zero-copy upload.
|
||||||
|
*
|
||||||
|
* \tparam Scalar_ float, double, complex<float>, or complex<double>
|
||||||
|
* \tparam UpLo_ Lower (default) or Upper — which triangle of A is stored
|
||||||
|
*/
|
||||||
|
template <typename Scalar_, int UpLo_ = Lower>
|
||||||
|
class GpuSparseLLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT<Scalar_, UpLo_>> {
|
||||||
|
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT>;
|
||||||
|
friend Base;
|
||||||
|
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
enum { UpLo = UpLo_ };
|
||||||
|
|
||||||
|
GpuSparseLLT() = default;
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
explicit GpuSparseLLT(const SparseMatrixBase<InputType>& A) {
|
||||||
|
this->compute(A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr bool needs_csr_conversion() { return false; }
|
||||||
|
static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_spd_type<Scalar>::value; }
|
||||||
|
static constexpr cudssMatrixViewType_t cudss_matrix_view() {
|
||||||
|
return internal::cudss_view_type<UpLo, ColMajor>::value;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_SPARSE_LLT_H
|
||||||
59
Eigen/src/GPU/GpuSparseLU.h
Normal file
59
Eigen/src/GPU/GpuSparseLU.h
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU sparse LU factorization via cuDSS.
|
||||||
|
//
|
||||||
|
// For general (non-symmetric) sparse matrices. Uses pivoting.
|
||||||
|
// Same three-phase workflow as GpuSparseLLT.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuSparseLU<double> lu(A); // analyze + factorize
|
||||||
|
// VectorXd x = lu.solve(b); // solve
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_SPARSE_LU_H
|
||||||
|
#define EIGEN_GPU_SPARSE_LU_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSparseSolverBase.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
/** GPU sparse LU factorization (general matrices).
|
||||||
|
*
|
||||||
|
* Wraps cuDSS with CUDSS_MTYPE_GENERAL and CUDSS_MVIEW_FULL.
|
||||||
|
* Accepts ColMajor SparseMatrix (CSC); internally converts to RowMajor
|
||||||
|
* CSR since cuDSS requires CSR input.
|
||||||
|
*
|
||||||
|
* \tparam Scalar_ float, double, complex<float>, or complex<double>
|
||||||
|
*/
|
||||||
|
template <typename Scalar_>
|
||||||
|
class GpuSparseLU : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLU<Scalar_>> {
|
||||||
|
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLU>;
|
||||||
|
friend Base;
|
||||||
|
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
|
||||||
|
GpuSparseLU() = default;
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
explicit GpuSparseLU(const SparseMatrixBase<InputType>& A) {
|
||||||
|
this->compute(A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr bool needs_csr_conversion() { return true; }
|
||||||
|
static constexpr cudssMatrixType_t cudss_matrix_type() { return CUDSS_MTYPE_GENERAL; }
|
||||||
|
static constexpr cudssMatrixViewType_t cudss_matrix_view() { return CUDSS_MVIEW_FULL; }
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_SPARSE_LU_H
|
||||||
356
Eigen/src/GPU/GpuSparseSolverBase.h
Normal file
356
Eigen/src/GPU/GpuSparseSolverBase.h
Normal file
@@ -0,0 +1,356 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Common base for GPU sparse direct solvers (LLT, LDLT, LU) via cuDSS.
|
||||||
|
//
|
||||||
|
// All three solver types share the same three-phase workflow
|
||||||
|
// (analyzePattern → factorize → solve) and differ only in the
|
||||||
|
// cudssMatrixType_t and cudssMatrixViewType_t passed to cuDSS.
|
||||||
|
// This CRTP base implements the entire workflow; derived classes
|
||||||
|
// provide the matrix type/view via static constexpr members.
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_SPARSE_SOLVER_BASE_H
|
||||||
|
#define EIGEN_GPU_SPARSE_SOLVER_BASE_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuDssSupport.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
/** CRTP base for GPU sparse direct solvers.
|
||||||
|
*
|
||||||
|
* \tparam Scalar_ Element type (passed explicitly to avoid incomplete-type issues with CRTP).
|
||||||
|
* \tparam Derived The concrete solver class (GpuSparseLLT, GpuSparseLDLT, GpuSparseLU).
|
||||||
|
* Must provide:
|
||||||
|
* - `static constexpr cudssMatrixType_t cudss_matrix_type()`
|
||||||
|
* - `static constexpr cudssMatrixViewType_t cudss_matrix_view()`
|
||||||
|
*/
|
||||||
|
template <typename Scalar_, typename Derived>
|
||||||
|
class GpuSparseSolverBase {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
using StorageIndex = int;
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
|
||||||
|
using CsrMat = SparseMatrix<Scalar, RowMajor, StorageIndex>;
|
||||||
|
using DenseVector = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||||
|
|
||||||
|
GpuSparseSolverBase() { init_context(); }
|
||||||
|
|
||||||
|
~GpuSparseSolverBase() {
|
||||||
|
destroy_cudss_objects();
|
||||||
|
if (handle_) (void)cudssDestroy(handle_);
|
||||||
|
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuSparseSolverBase(const GpuSparseSolverBase&) = delete;
|
||||||
|
GpuSparseSolverBase& operator=(const GpuSparseSolverBase&) = delete;
|
||||||
|
|
||||||
|
// ---- Configuration --------------------------------------------------------
|
||||||
|
|
||||||
|
/** Set the fill-reducing ordering algorithm. Must be called before compute/analyzePattern. */
|
||||||
|
void setOrdering(GpuSparseOrdering ordering) { ordering_ = ordering; }
|
||||||
|
|
||||||
|
// ---- Factorization --------------------------------------------------------
|
||||||
|
|
||||||
|
/** Symbolic analysis + numeric factorization. */
|
||||||
|
template <typename InputType>
|
||||||
|
Derived& compute(const SparseMatrixBase<InputType>& A) {
|
||||||
|
analyzePattern(A);
|
||||||
|
if (info_ == Success) {
|
||||||
|
factorize(A);
|
||||||
|
}
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Symbolic analysis only. Uploads sparsity structure to device.
|
||||||
|
* This phase is synchronous (blocks until complete). */
|
||||||
|
template <typename InputType>
|
||||||
|
Derived& analyzePattern(const SparseMatrixBase<InputType>& A) {
|
||||||
|
const SpMat csc(A.derived());
|
||||||
|
eigen_assert(csc.rows() == csc.cols() && "GpuSparseSolver requires a square matrix");
|
||||||
|
eigen_assert(csc.isCompressed() && "GpuSparseSolver requires a compressed sparse matrix");
|
||||||
|
|
||||||
|
n_ = csc.rows();
|
||||||
|
info_ = InvalidInput;
|
||||||
|
analysis_done_ = false;
|
||||||
|
|
||||||
|
if (n_ == 0) {
|
||||||
|
nnz_ = 0;
|
||||||
|
info_ = Success;
|
||||||
|
analysis_done_ = true;
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
// For symmetric solvers, ColMajor CSC can be reinterpreted as CSR with
|
||||||
|
// swapped triangle view (zero copy). For general solvers, we must convert
|
||||||
|
// to actual RowMajor CSR so cuDSS sees the correct matrix, not A^T.
|
||||||
|
if (Derived::needs_csr_conversion()) {
|
||||||
|
const CsrMat csr(csc);
|
||||||
|
nnz_ = csr.nonZeros();
|
||||||
|
upload_csr(csr);
|
||||||
|
} else {
|
||||||
|
nnz_ = csc.nonZeros();
|
||||||
|
upload_csr_from_csc(csc);
|
||||||
|
}
|
||||||
|
create_cudss_matrix();
|
||||||
|
apply_ordering_config();
|
||||||
|
|
||||||
|
if (data_) EIGEN_CUDSS_CHECK(cudssDataDestroy(handle_, data_));
|
||||||
|
EIGEN_CUDSS_CHECK(cudssDataCreate(handle_, &data_));
|
||||||
|
|
||||||
|
create_placeholder_dense();
|
||||||
|
|
||||||
|
EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_ANALYSIS, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
|
||||||
|
|
||||||
|
analysis_done_ = true;
|
||||||
|
info_ = Success;
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Numeric factorization using the symbolic analysis from analyzePattern.
|
||||||
|
*
|
||||||
|
* \warning The sparsity pattern (outerIndexPtr, innerIndexPtr) must be
|
||||||
|
* identical to the one passed to analyzePattern(). Only the numerical
|
||||||
|
* values may change. Passing a different pattern is undefined behavior.
|
||||||
|
* This matches the contract of CHOLMOD, UMFPACK, and cuDSS's own API.
|
||||||
|
*
|
||||||
|
* This phase is asynchronous — info() lazily synchronizes. */
|
||||||
|
template <typename InputType>
|
||||||
|
Derived& factorize(const SparseMatrixBase<InputType>& A) {
|
||||||
|
eigen_assert(analysis_done_ && "factorize() requires analyzePattern() first");
|
||||||
|
|
||||||
|
if (n_ == 0) {
|
||||||
|
info_ = Success;
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert to the same format used in analyzePattern.
|
||||||
|
// Both temporaries must outlive the async memcpy (pageable H2D is actually
|
||||||
|
// synchronous w.r.t. the host, but keep them alive for clarity).
|
||||||
|
const SpMat csc(A.derived());
|
||||||
|
eigen_assert(csc.rows() == n_ && csc.cols() == n_);
|
||||||
|
|
||||||
|
const Scalar* value_ptr;
|
||||||
|
Index value_nnz;
|
||||||
|
CsrMat csr_tmp;
|
||||||
|
if (Derived::needs_csr_conversion()) {
|
||||||
|
csr_tmp = CsrMat(csc);
|
||||||
|
value_ptr = csr_tmp.valuePtr();
|
||||||
|
value_nnz = csr_tmp.nonZeros();
|
||||||
|
} else {
|
||||||
|
value_ptr = csc.valuePtr();
|
||||||
|
value_nnz = csc.nonZeros();
|
||||||
|
}
|
||||||
|
eigen_assert(value_nnz == nnz_);
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, value_ptr, static_cast<size_t>(nnz_) * sizeof(Scalar),
|
||||||
|
cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
EIGEN_CUDSS_CHECK(cudssMatrixSetValues(d_A_cudss_, d_values_.ptr));
|
||||||
|
|
||||||
|
info_ = InvalidInput;
|
||||||
|
info_synced_ = false;
|
||||||
|
EIGEN_CUDSS_CHECK(
|
||||||
|
cudssExecute(handle_, CUDSS_PHASE_FACTORIZATION, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
|
||||||
|
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve ----------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Solve A * X = B. Returns X as a dense matrix.
|
||||||
|
* Supports single or multiple right-hand sides. */
|
||||||
|
template <typename Rhs>
|
||||||
|
DenseMatrix solve(const MatrixBase<Rhs>& B) const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success && "GpuSparseSolver::solve requires a successful factorization");
|
||||||
|
eigen_assert(B.rows() == n_);
|
||||||
|
|
||||||
|
const DenseMatrix rhs(B);
|
||||||
|
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
||||||
|
|
||||||
|
if (n_ == 0) return DenseMatrix(0, rhs.cols());
|
||||||
|
|
||||||
|
const size_t rhs_bytes = static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||||
|
DeviceBuffer d_b(rhs_bytes);
|
||||||
|
DeviceBuffer d_x(rhs_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_b.ptr, rhs.data(), rhs_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
||||||
|
cudssMatrix_t b_cudss = nullptr, x_cudss = nullptr;
|
||||||
|
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&b_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_b.ptr,
|
||||||
|
dtype, CUDSS_LAYOUT_COL_MAJOR));
|
||||||
|
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&x_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_x.ptr,
|
||||||
|
dtype, CUDSS_LAYOUT_COL_MAJOR));
|
||||||
|
|
||||||
|
EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_SOLVE, config_, data_, d_A_cudss_, x_cudss, b_cudss));
|
||||||
|
|
||||||
|
DenseMatrix X(n_, rhs.cols());
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_x.ptr, rhs_bytes, cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
|
||||||
|
(void)cudssMatrixDestroy(b_cudss);
|
||||||
|
(void)cudssMatrixDestroy(x_cudss);
|
||||||
|
|
||||||
|
return X;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Accessors ------------------------------------------------------------
|
||||||
|
|
||||||
|
ComputationInfo info() const {
|
||||||
|
sync_info();
|
||||||
|
return info_;
|
||||||
|
}
|
||||||
|
Index rows() const { return n_; }
|
||||||
|
Index cols() const { return n_; }
|
||||||
|
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
|
||||||
|
protected:
|
||||||
|
// ---- CUDA / cuDSS handles -------------------------------------------------
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cudssHandle_t handle_ = nullptr;
|
||||||
|
cudssConfig_t config_ = nullptr;
|
||||||
|
cudssData_t data_ = nullptr;
|
||||||
|
cudssMatrix_t d_A_cudss_ = nullptr;
|
||||||
|
cudssMatrix_t d_x_cudss_ = nullptr;
|
||||||
|
cudssMatrix_t d_b_cudss_ = nullptr;
|
||||||
|
|
||||||
|
// ---- Device buffers for CSR arrays ----------------------------------------
|
||||||
|
DeviceBuffer d_rowPtr_;
|
||||||
|
DeviceBuffer d_colIdx_;
|
||||||
|
DeviceBuffer d_values_;
|
||||||
|
|
||||||
|
// ---- State ----------------------------------------------------------------
|
||||||
|
Index n_ = 0;
|
||||||
|
Index nnz_ = 0;
|
||||||
|
ComputationInfo info_ = InvalidInput;
|
||||||
|
bool info_synced_ = true;
|
||||||
|
bool analysis_done_ = false;
|
||||||
|
GpuSparseOrdering ordering_ = GpuSparseOrdering::AMD;
|
||||||
|
|
||||||
|
private:
|
||||||
|
Derived& derived() { return static_cast<Derived&>(*this); }
|
||||||
|
const Derived& derived() const { return static_cast<const Derived&>(*this); }
|
||||||
|
|
||||||
|
void init_context() {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||||
|
EIGEN_CUDSS_CHECK(cudssCreate(&handle_));
|
||||||
|
EIGEN_CUDSS_CHECK(cudssSetStream(handle_, stream_));
|
||||||
|
EIGEN_CUDSS_CHECK(cudssConfigCreate(&config_));
|
||||||
|
}
|
||||||
|
|
||||||
|
void sync_info() const {
|
||||||
|
if (!info_synced_) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
int cudss_info = 0;
|
||||||
|
EIGEN_CUDSS_CHECK(cudssDataGet(handle_, data_, CUDSS_DATA_INFO, &cudss_info, sizeof(cudss_info), nullptr));
|
||||||
|
auto* self = const_cast<GpuSparseSolverBase*>(this);
|
||||||
|
self->info_ = (cudss_info == 0) ? Success : NumericalIssue;
|
||||||
|
self->info_synced_ = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void destroy_cudss_objects() {
|
||||||
|
if (d_A_cudss_) {
|
||||||
|
(void)cudssMatrixDestroy(d_A_cudss_);
|
||||||
|
d_A_cudss_ = nullptr;
|
||||||
|
}
|
||||||
|
if (d_x_cudss_) {
|
||||||
|
(void)cudssMatrixDestroy(d_x_cudss_);
|
||||||
|
d_x_cudss_ = nullptr;
|
||||||
|
}
|
||||||
|
if (d_b_cudss_) {
|
||||||
|
(void)cudssMatrixDestroy(d_b_cudss_);
|
||||||
|
d_b_cudss_ = nullptr;
|
||||||
|
}
|
||||||
|
if (data_) {
|
||||||
|
(void)cudssDataDestroy(handle_, data_);
|
||||||
|
data_ = nullptr;
|
||||||
|
}
|
||||||
|
if (config_) {
|
||||||
|
(void)cudssConfigDestroy(config_);
|
||||||
|
config_ = nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Upload CSR from a RowMajor sparse matrix (native CSR).
|
||||||
|
void upload_csr(const CsrMat& csr) { upload_compressed(csr.outerIndexPtr(), csr.innerIndexPtr(), csr.valuePtr()); }
|
||||||
|
|
||||||
|
// Upload CSC arrays reinterpreted as CSR (for symmetric matrices: CSC(A) = CSR(A^T) = CSR(A)).
|
||||||
|
void upload_csr_from_csc(const SpMat& csc) {
|
||||||
|
upload_compressed(csc.outerIndexPtr(), csc.innerIndexPtr(), csc.valuePtr());
|
||||||
|
}
|
||||||
|
|
||||||
|
void upload_compressed(const StorageIndex* outer, const StorageIndex* inner, const Scalar* values) {
|
||||||
|
const size_t rowptr_bytes = static_cast<size_t>(n_ + 1) * sizeof(StorageIndex);
|
||||||
|
const size_t colidx_bytes = static_cast<size_t>(nnz_) * sizeof(StorageIndex);
|
||||||
|
const size_t values_bytes = static_cast<size_t>(nnz_) * sizeof(Scalar);
|
||||||
|
|
||||||
|
d_rowPtr_ = DeviceBuffer(rowptr_bytes);
|
||||||
|
d_colIdx_ = DeviceBuffer(colidx_bytes);
|
||||||
|
d_values_ = DeviceBuffer(values_bytes);
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_rowPtr_.ptr, outer, rowptr_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_colIdx_.ptr, inner, colidx_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, values, values_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
void create_cudss_matrix() {
|
||||||
|
if (d_A_cudss_) (void)cudssMatrixDestroy(d_A_cudss_);
|
||||||
|
|
||||||
|
constexpr cudaDataType_t idx_type = cudss_index_type<StorageIndex>::value;
|
||||||
|
constexpr cudaDataType_t val_type = cuda_data_type<Scalar>::value;
|
||||||
|
constexpr cudssMatrixType_t mtype = Derived::cudss_matrix_type();
|
||||||
|
constexpr cudssMatrixViewType_t mview = Derived::cudss_matrix_view();
|
||||||
|
|
||||||
|
EIGEN_CUDSS_CHECK(cudssMatrixCreateCsr(
|
||||||
|
&d_A_cudss_, static_cast<int64_t>(n_), static_cast<int64_t>(n_), static_cast<int64_t>(nnz_), d_rowPtr_.ptr,
|
||||||
|
/*rowEnd=*/nullptr, d_colIdx_.ptr, d_values_.ptr, idx_type, val_type, mtype, mview, CUDSS_BASE_ZERO));
|
||||||
|
}
|
||||||
|
|
||||||
|
void apply_ordering_config() {
|
||||||
|
cudssAlgType_t alg;
|
||||||
|
switch (ordering_) {
|
||||||
|
case GpuSparseOrdering::AMD:
|
||||||
|
alg = CUDSS_ALG_DEFAULT;
|
||||||
|
break;
|
||||||
|
case GpuSparseOrdering::METIS:
|
||||||
|
alg = CUDSS_ALG_2;
|
||||||
|
break;
|
||||||
|
case GpuSparseOrdering::RCM:
|
||||||
|
alg = CUDSS_ALG_3;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
alg = CUDSS_ALG_DEFAULT;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
EIGEN_CUDSS_CHECK(cudssConfigSet(config_, CUDSS_CONFIG_REORDERING_ALG, &alg, sizeof(alg)));
|
||||||
|
}
|
||||||
|
|
||||||
|
void create_placeholder_dense() {
|
||||||
|
if (d_x_cudss_) (void)cudssMatrixDestroy(d_x_cudss_);
|
||||||
|
if (d_b_cudss_) (void)cudssMatrixDestroy(d_b_cudss_);
|
||||||
|
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
||||||
|
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_x_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
|
||||||
|
dtype, CUDSS_LAYOUT_COL_MAJOR));
|
||||||
|
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_b_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
|
||||||
|
dtype, CUDSS_LAYOUT_COL_MAJOR));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_SPARSE_SOLVER_BASE_H
|
||||||
@@ -1,8 +1,8 @@
|
|||||||
# Eigen GPU Module (`Eigen/GPU`)
|
# Eigen GPU Module (`Eigen/GPU`)
|
||||||
|
|
||||||
GPU-accelerated dense linear algebra for Eigen users, dispatching to NVIDIA
|
GPU-accelerated linear algebra for Eigen users, dispatching to NVIDIA CUDA
|
||||||
CUDA libraries (cuBLAS, cuSOLVER). Requires CUDA 11.4+. Header-only (link
|
libraries (cuBLAS, cuSOLVER, cuFFT, cuSPARSE, cuDSS). Requires CUDA 11.4+;
|
||||||
against CUDA runtime, cuBLAS, and cuSOLVER).
|
cuDSS features require CUDA 12.0+ and a separate cuDSS install. Header-only.
|
||||||
|
|
||||||
## Why this module
|
## Why this module
|
||||||
|
|
||||||
@@ -10,25 +10,31 @@ Eigen is the linear algebra foundation for a large ecosystem of C++ projects
|
|||||||
in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP,
|
in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP,
|
||||||
Open3D), scientific computing (Ceres, Stan), and beyond. Many of these
|
Open3D), scientific computing (Ceres, Stan), and beyond. Many of these
|
||||||
projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations
|
projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations
|
||||||
without dropping down to raw CUDA library APIs. Third-party projects like
|
without dropping down to raw CUDA library APIs.
|
||||||
[EigenCuda](https://github.com/NLESC-JCER/EigenCuda) and
|
|
||||||
[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically to fill
|
|
||||||
this gap, and downstream projects like
|
|
||||||
[Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
|
|
||||||
[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
|
|
||||||
GPU-accelerated solvers through Eigen.
|
|
||||||
|
|
||||||
The `Eigen/GPU` module aims to close this gap: Existing Eigen users should be
|
GPU sparse solvers are a particularly acute gap. Sparse factorization is the
|
||||||
able to move performance-critical dense linear algebra to the GPU with minimal
|
bottleneck in SLAM, bundle adjustment, FEM, and nonlinear optimization --
|
||||||
code changes and without learning CUDA library APIs directly.
|
exactly the workloads where GPU acceleration matters most. Downstream projects
|
||||||
|
like [Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
|
||||||
|
[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
|
||||||
|
GPU-accelerated sparse solvers, and third-party projects like
|
||||||
|
[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically because
|
||||||
|
Eigen lacks them. The `Eigen/GPU` module provides GPU sparse Cholesky, LDL^T,
|
||||||
|
and LU factorization via cuDSS, alongside dense solvers (cuSOLVER), matrix
|
||||||
|
products (cuBLAS), FFT (cuFFT), and sparse matrix-vector products (cuSPARSE).
|
||||||
|
|
||||||
|
Existing Eigen users should be able to move performance-critical dense or
|
||||||
|
sparse linear algebra to the GPU with minimal code changes and without
|
||||||
|
learning CUDA library APIs directly.
|
||||||
|
|
||||||
## Design philosophy
|
## Design philosophy
|
||||||
|
|
||||||
**CPU and GPU coexist.** There is no global compile-time switch that replaces
|
**CPU and GPU coexist.** There is no global compile-time switch that replaces
|
||||||
CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers
|
CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers
|
||||||
explicitly -- `GpuLLT<double>` vs `LLT<MatrixXd>` -- and both coexist in
|
explicitly -- `GpuLLT<double>` vs `LLT<MatrixXd>`, `GpuSparseLLT<double>` vs
|
||||||
the same binary. This also lets users keep the factored matrix on device across
|
`SimplicialLLT<SparseMatrix<double>>` -- and both coexist in the same binary.
|
||||||
multiple solves, something impossible with compile-time replacement.
|
This also lets users keep the factored matrix on device across multiple solves,
|
||||||
|
something impossible with compile-time replacement.
|
||||||
|
|
||||||
**Familiar syntax.** GPU operations use the same expression patterns as CPU
|
**Familiar syntax.** GPU operations use the same expression patterns as CPU
|
||||||
Eigen. Here is a side-by-side comparison:
|
Eigen. Here is a side-by-side comparison:
|
||||||
@@ -38,6 +44,7 @@ Eigen. Here is a side-by-side comparison:
|
|||||||
#include <Eigen/Dense> #define EIGEN_USE_GPU
|
#include <Eigen/Dense> #define EIGEN_USE_GPU
|
||||||
#include <Eigen/GPU>
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
// Dense
|
||||||
MatrixXd A = ...; auto d_A = DeviceMatrix<double>::fromHost(A);
|
MatrixXd A = ...; auto d_A = DeviceMatrix<double>::fromHost(A);
|
||||||
MatrixXd B = ...; auto d_B = DeviceMatrix<double>::fromHost(B);
|
MatrixXd B = ...; auto d_B = DeviceMatrix<double>::fromHost(B);
|
||||||
|
|
||||||
@@ -45,11 +52,15 @@ MatrixXd C = A * B; DeviceMatrix<double> d_C = d_A * d_B;
|
|||||||
MatrixXd X = A.llt().solve(B); DeviceMatrix<double> d_X = d_A.llt().solve(d_B);
|
MatrixXd X = A.llt().solve(B); DeviceMatrix<double> d_X = d_A.llt().solve(d_B);
|
||||||
|
|
||||||
MatrixXd X = d_X.toHost();
|
MatrixXd X = d_X.toHost();
|
||||||
|
|
||||||
|
// Sparse (using SpMat = SparseMatrix<double>)
|
||||||
|
SimplicialLLT<SpMat> llt(A); GpuSparseLLT<double> llt(A);
|
||||||
|
VectorXd x = llt.solve(b); VectorXd x = llt.solve(b);
|
||||||
```
|
```
|
||||||
|
|
||||||
The GPU version reads like CPU Eigen with explicit upload/download.
|
The GPU version reads like CPU Eigen with explicit upload/download for dense
|
||||||
`operator*` dispatches to cuBLAS GEMM, `.llt().solve()` dispatches to
|
operations, and an almost identical API for sparse solvers. Unsupported
|
||||||
cuSOLVER potrf + potrs. Unsupported expressions are compile errors.
|
expressions are compile errors.
|
||||||
|
|
||||||
**Explicit over implicit.** Host-device transfers, stream management, and
|
**Explicit over implicit.** Host-device transfers, stream management, and
|
||||||
library handle lifetimes are visible in the API. There are no hidden
|
library handle lifetimes are visible in the API. There are no hidden
|
||||||
@@ -160,10 +171,109 @@ MatrixXd X2 = d_X2.toHost();
|
|||||||
GpuLU<double> lu;
|
GpuLU<double> lu;
|
||||||
lu.compute(d_A);
|
lu.compute(d_A);
|
||||||
auto d_Y = lu.solve(d_B, GpuLU<double>::Transpose); // A^T Y = B
|
auto d_Y = lu.solve(d_B, GpuLU<double>::Transpose); // A^T Y = B
|
||||||
|
|
||||||
|
// QR solve (overdetermined least squares)
|
||||||
|
GpuQR<double> qr;
|
||||||
|
qr.compute(d_A); // factorize on device (async)
|
||||||
|
auto d_X = qr.solve(d_B); // Q^H * B via ormqr, then trsm on R
|
||||||
|
MatrixXd X = d_X.toHost();
|
||||||
|
|
||||||
|
// SVD (results downloaded on access)
|
||||||
|
GpuSVD<double> svd;
|
||||||
|
svd.compute(d_A, ComputeThinU | ComputeThinV);
|
||||||
|
VectorXd S = svd.singularValues(); // downloads to host
|
||||||
|
MatrixXd U = svd.matrixU(); // downloads to host
|
||||||
|
MatrixXd VT = svd.matrixVT(); // V^T (matches cuSOLVER)
|
||||||
|
|
||||||
|
// Self-adjoint eigenvalue decomposition (results downloaded on access)
|
||||||
|
GpuSelfAdjointEigenSolver<double> es;
|
||||||
|
es.compute(d_A);
|
||||||
|
VectorXd eigenvals = es.eigenvalues(); // downloads to host
|
||||||
|
MatrixXd eigenvecs = es.eigenvectors(); // downloads to host
|
||||||
```
|
```
|
||||||
|
|
||||||
The cached API keeps the factored matrix on device, avoiding redundant
|
The cached API keeps the factored matrix on device, avoiding redundant
|
||||||
host-device transfers and re-factorizations.
|
host-device transfers and re-factorizations. All solvers also accept host
|
||||||
|
matrices directly as a convenience (e.g., `GpuLLT<double> llt(A)` or
|
||||||
|
`qr.solve(B)`), which handles upload/download internally.
|
||||||
|
|
||||||
|
### Sparse direct solvers (cuDSS)
|
||||||
|
|
||||||
|
Requires cuDSS (separate install, CUDA 12.0+). Define `EIGEN_CUDSS` before
|
||||||
|
including `Eigen/GPU` and link with `-lcudss`.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
SparseMatrix<double> A = ...; // symmetric positive definite
|
||||||
|
VectorXd b = ...;
|
||||||
|
|
||||||
|
// Sparse Cholesky -- one-liner
|
||||||
|
GpuSparseLLT<double> llt(A);
|
||||||
|
VectorXd x = llt.solve(b);
|
||||||
|
|
||||||
|
// Three-phase workflow for repeated solves with the same sparsity pattern
|
||||||
|
GpuSparseLLT<double> llt;
|
||||||
|
llt.analyzePattern(A); // symbolic analysis (once)
|
||||||
|
llt.factorize(A); // numeric factorization
|
||||||
|
VectorXd x = llt.solve(b);
|
||||||
|
llt.factorize(A_new_values); // refactorize (reuses symbolic analysis)
|
||||||
|
VectorXd x2 = llt.solve(b);
|
||||||
|
|
||||||
|
// Sparse LDL^T (symmetric indefinite)
|
||||||
|
GpuSparseLDLT<double> ldlt(A);
|
||||||
|
VectorXd x = ldlt.solve(b);
|
||||||
|
|
||||||
|
// Sparse LU (general non-symmetric)
|
||||||
|
GpuSparseLU<double> lu(A);
|
||||||
|
VectorXd x = lu.solve(b);
|
||||||
|
```
|
||||||
|
|
||||||
|
### FFT (cuFFT)
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuFFT<float> fft;
|
||||||
|
|
||||||
|
// 1D complex-to-complex
|
||||||
|
VectorXcf X = fft.fwd(x); // forward
|
||||||
|
VectorXcf y = fft.inv(X); // inverse (scaled by 1/n)
|
||||||
|
|
||||||
|
// 1D real-to-complex / complex-to-real
|
||||||
|
VectorXcf R = fft.fwd(r); // returns n/2+1 complex (half-spectrum)
|
||||||
|
VectorXf s = fft.invReal(R, n); // C2R inverse, caller specifies n
|
||||||
|
|
||||||
|
// 2D complex-to-complex
|
||||||
|
MatrixXcf B = fft.fwd2d(A); // 2D forward
|
||||||
|
MatrixXcf C = fft.inv2d(B); // 2D inverse (scaled by 1/(rows*cols))
|
||||||
|
|
||||||
|
// Plans are cached and reused across calls with the same size/type.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Sparse matrix-vector multiply (cuSPARSE)
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
SparseMatrix<double> A = ...;
|
||||||
|
VectorXd x = ...;
|
||||||
|
|
||||||
|
GpuSparseContext<double> ctx;
|
||||||
|
VectorXd y = ctx.multiply(A, x); // y = A * x
|
||||||
|
VectorXd z = ctx.multiplyT(A, x); // z = A^T * x
|
||||||
|
ctx.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y
|
||||||
|
|
||||||
|
// Multiple RHS (SpMM)
|
||||||
|
MatrixXd Y = ctx.multiplyMat(A, X); // Y = A * X
|
||||||
|
```
|
||||||
|
|
||||||
|
### Precision control
|
||||||
|
|
||||||
|
GEMM dispatch enables tensor core algorithms by default, allowing cuBLAS to
|
||||||
|
choose the fastest algorithm for the given precision and architecture. For
|
||||||
|
double precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64
|
||||||
|
results computed faster via tensor cores.
|
||||||
|
|
||||||
|
| Macro | Effect |
|
||||||
|
|---|---|
|
||||||
|
| *(default)* | Tensor core algorithms enabled. Float uses full FP32. Double may use Ozaki on sm_80+. |
|
||||||
|
| `EIGEN_CUDA_TF32` | Opt-in: Float uses TF32 (~2x faster, 10-bit mantissa). Double unaffected. |
|
||||||
|
| `EIGEN_NO_CUDA_TENSOR_OPS` | Opt-out: Pedantic compute types, no tensor cores. For bit-exact reproducibility. |
|
||||||
|
|
||||||
### Stream control and async execution
|
### Stream control and async execution
|
||||||
|
|
||||||
@@ -190,7 +300,8 @@ skip the wait (CUDA guarantees in-order execution within a stream).
|
|||||||
|
|
||||||
### Supported scalar types
|
### Supported scalar types
|
||||||
|
|
||||||
`float`, `double`, `std::complex<float>`, `std::complex<double>`.
|
`float`, `double`, `std::complex<float>`, `std::complex<double>` (unless
|
||||||
|
noted otherwise).
|
||||||
|
|
||||||
### Expression -> library call mapping
|
### Expression -> library call mapping
|
||||||
|
|
||||||
@@ -212,29 +323,41 @@ skip the wait (CUDA guarantees in-order execution within a stream).
|
|||||||
| `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
|
| `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
|
||||||
| `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
|
| `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
|
||||||
|
|
||||||
### `DeviceMatrix<Scalar>` API
|
### `DeviceMatrix<Scalar>`
|
||||||
|
|
||||||
| Method | Sync? | Description |
|
Typed RAII wrapper for a dense column-major matrix in GPU device memory.
|
||||||
|--------|-------|-------------|
|
Always dense (leading dimension = rows). A vector is a `DeviceMatrix` with
|
||||||
| `DeviceMatrix()` | -- | Empty (0x0) |
|
one column.
|
||||||
| `DeviceMatrix(rows, cols)` | -- | Allocate uninitialized |
|
|
||||||
| `fromHost(matrix, stream)` | yes | Upload from Eigen matrix |
|
```cpp
|
||||||
| `fromHostAsync(ptr, rows, cols, outerStride, stream)` | no | Async upload (caller manages lifetime) |
|
// Construction
|
||||||
| `toHost(stream)` | yes | Synchronous download |
|
DeviceMatrix<Scalar>() // Empty (0x0)
|
||||||
| `toHostAsync(stream)` | no | Returns `HostTransfer` future |
|
DeviceMatrix<Scalar>(rows, cols) // Allocate uninitialized
|
||||||
| `clone(stream)` | no | Device-to-device deep copy |
|
|
||||||
| `resize(rows, cols)` | -- | Discard contents, reallocate |
|
// Upload / download
|
||||||
| `data()` | -- | Raw device pointer |
|
static DeviceMatrix fromHost(matrix, stream=nullptr) // -> DeviceMatrix (syncs)
|
||||||
| `rows()`, `cols()` | -- | Dimensions |
|
static DeviceMatrix fromHostAsync(ptr, rows, cols, outerStride, s) // -> DeviceMatrix (no sync, caller manages ptr lifetime)
|
||||||
| `sizeInBytes()` | -- | Total device allocation size in bytes |
|
PlainMatrix toHost(stream=nullptr) // -> host Matrix (syncs)
|
||||||
| `empty()` | -- | True if 0x0 |
|
HostTransfer toHostAsync(stream=nullptr) // -> HostTransfer future (no sync)
|
||||||
| `adjoint()` | -- | Adjoint view (GEMM ConjTrans) |
|
DeviceMatrix clone(stream=nullptr) // -> DeviceMatrix (D2D copy, async)
|
||||||
| `transpose()` | -- | Transpose view (GEMM Trans) |
|
|
||||||
| `llt()` / `llt<UpLo>()` | -- | Cholesky expression builder |
|
// Dimensions and access
|
||||||
| `lu()` | -- | LU expression builder |
|
Index rows()
|
||||||
| `triangularView<UpLo>()` | -- | Triangular view (TRSM) |
|
Index cols()
|
||||||
| `selfadjointView<UpLo>()` | -- | Self-adjoint view (SYMM, rankUpdate) |
|
size_t sizeInBytes()
|
||||||
| `device(ctx)` | -- | Assignment proxy bound to context |
|
bool empty()
|
||||||
|
Scalar* data() // Raw device pointer
|
||||||
|
void resize(Index rows, Index cols) // Discard contents, reallocate
|
||||||
|
|
||||||
|
// Expression builders (return lightweight views, evaluated on assignment)
|
||||||
|
AdjointView adjoint() // GEMM with ConjTrans
|
||||||
|
TransposeView transpose() // GEMM with Trans
|
||||||
|
LltExpr llt() / llt<UpLo>() // -> .solve(d_B) -> DeviceMatrix
|
||||||
|
LuExpr lu() // -> .solve(d_B) -> DeviceMatrix
|
||||||
|
TriangularView triangularView<UpLo>() // -> .solve(d_B) -> DeviceMatrix (TRSM)
|
||||||
|
SelfAdjointView selfadjointView<UpLo>() // -> * d_B (SYMM), .rankUpdate(d_A) (SYRK)
|
||||||
|
DeviceAssignment device(GpuContext& ctx) // Bind assignment to explicit stream
|
||||||
|
```
|
||||||
|
|
||||||
### `GpuContext`
|
### `GpuContext`
|
||||||
|
|
||||||
@@ -251,34 +374,190 @@ cusolverDnHandle_t cusolverHandle()
|
|||||||
|
|
||||||
Non-copyable, non-movable (owns library handles).
|
Non-copyable, non-movable (owns library handles).
|
||||||
|
|
||||||
### `GpuLLT<Scalar, UpLo>` API
|
### `GpuLLT<Scalar, UpLo>` -- Dense Cholesky (cuSOLVER)
|
||||||
|
|
||||||
GPU dense Cholesky (LL^T) via cuSOLVER. Caches factor on device.
|
Caches the Cholesky factor on device for repeated solves.
|
||||||
|
|
||||||
| Method | Sync? | Description |
|
```cpp
|
||||||
|--------|-------|-------------|
|
GpuLLT() // Default construct, then call compute()
|
||||||
| `GpuLLT(A)` | deferred | Construct and factorize from host matrix |
|
GpuLLT(const EigenBase<D>& A) // Convenience: upload + factorize
|
||||||
| `compute(host_matrix)` | deferred | Upload and factorize |
|
|
||||||
| `compute(DeviceMatrix)` | deferred | D2D copy and factorize |
|
|
||||||
| `compute(DeviceMatrix&&)` | deferred | Move-adopt and factorize (no copy) |
|
|
||||||
| `solve(host_matrix)` | yes | Solve, return host matrix |
|
|
||||||
| `solve(DeviceMatrix)` | no | Solve, return `DeviceMatrix` (async) |
|
|
||||||
| `info()` | lazy | Syncs stream on first call, returns `Success` or `NumericalIssue` |
|
|
||||||
|
|
||||||
### `GpuLU<Scalar>` API
|
GpuLLT& compute(const EigenBase<D>& A) // Upload + factorize
|
||||||
|
GpuLLT& compute(const DeviceMatrix& d_A) // D2D copy + factorize
|
||||||
|
GpuLLT& compute(DeviceMatrix&& d_A) // Adopt + factorize (no copy)
|
||||||
|
|
||||||
GPU dense partial-pivoting LU via cuSOLVER. Same pattern as `GpuLLT`, plus
|
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
|
||||||
`TransposeMode` parameter on `solve()` (`NoTranspose`, `Transpose`,
|
DeviceMatrix solve(const DeviceMatrix& d_B) // -> DeviceMatrix (async, stays on device)
|
||||||
`ConjugateTranspose`).
|
|
||||||
|
|
||||||
### `HostTransfer<Scalar>` API
|
ComputationInfo info() // Lazy sync on first call: Success or NumericalIssue
|
||||||
|
Index rows() / cols()
|
||||||
|
cudaStream_t stream()
|
||||||
|
```
|
||||||
|
|
||||||
Future for async device-to-host transfer.
|
### `GpuLU<Scalar>` -- Dense LU (cuSOLVER)
|
||||||
|
|
||||||
| Method | Description |
|
Same pattern as `GpuLLT`. Adds `TransposeMode` parameter on `solve()`.
|
||||||
|--------|-------------|
|
|
||||||
| `get()` | Block until transfer completes, return host matrix reference. Idempotent. |
|
```cpp
|
||||||
| `ready()` | Non-blocking poll |
|
PlainMatrix solve(const MatrixBase<D>& B, TransposeMode m = NoTranspose) // -> host Matrix
|
||||||
|
DeviceMatrix solve(const DeviceMatrix& d_B, TransposeMode m = NoTranspose) // -> DeviceMatrix
|
||||||
|
```
|
||||||
|
|
||||||
|
`TransposeMode`: `NoTranspose`, `Transpose`, `ConjugateTranspose`.
|
||||||
|
|
||||||
|
### `GpuQR<Scalar>` -- Dense QR (cuSOLVER)
|
||||||
|
|
||||||
|
QR factorization via `cusolverDnXgeqrf`. Solve uses ORMQR (apply Q^H) + TRSM
|
||||||
|
(back-substitute on R) -- Q is never formed explicitly.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuQR() // Default construct
|
||||||
|
GpuQR(const EigenBase<D>& A) // Convenience: upload + factorize
|
||||||
|
|
||||||
|
GpuQR& compute(const EigenBase<D>& A) // Upload + factorize
|
||||||
|
GpuQR& compute(const DeviceMatrix& d_A) // D2D copy + factorize
|
||||||
|
|
||||||
|
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
|
||||||
|
DeviceMatrix solve(const DeviceMatrix& d_B) // -> DeviceMatrix (async)
|
||||||
|
|
||||||
|
ComputationInfo info() // Lazy sync
|
||||||
|
Index rows() / cols()
|
||||||
|
cudaStream_t stream()
|
||||||
|
```
|
||||||
|
|
||||||
|
### `GpuSVD<Scalar>` -- Dense SVD (cuSOLVER)
|
||||||
|
|
||||||
|
SVD via `cusolverDnXgesvd`. Supports `ComputeThinU | ComputeThinV`,
|
||||||
|
`ComputeFullU | ComputeFullV`, or `0` (values only). Wide matrices (m < n)
|
||||||
|
handled by internal transpose.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuSVD() // Default construct, then call compute()
|
||||||
|
GpuSVD(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV) // Convenience
|
||||||
|
|
||||||
|
GpuSVD& compute(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV)
|
||||||
|
GpuSVD& compute(const DeviceMatrix& d_A, unsigned options = ComputeThinU | ComputeThinV)
|
||||||
|
|
||||||
|
RealVector singularValues() // -> host vector (syncs, downloads)
|
||||||
|
PlainMatrix matrixU() // -> host Matrix (syncs, downloads)
|
||||||
|
PlainMatrix matrixVT() // -> host Matrix (syncs, downloads V^T)
|
||||||
|
|
||||||
|
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (pseudoinverse)
|
||||||
|
PlainMatrix solve(const MatrixBase<D>& B, Index k) // Truncated (top k triplets)
|
||||||
|
PlainMatrix solve(const MatrixBase<D>& B, RealScalar l) // Tikhonov regularized
|
||||||
|
|
||||||
|
Index rank(RealScalar threshold = -1)
|
||||||
|
ComputationInfo info() // Lazy sync
|
||||||
|
Index rows() / cols()
|
||||||
|
cudaStream_t stream()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:** `singularValues()`, `matrixU()`, and `matrixVT()` download to host
|
||||||
|
on each call. Device-side accessors returning `DeviceMatrix` are planned but
|
||||||
|
not yet implemented.
|
||||||
|
|
||||||
|
### `GpuSelfAdjointEigenSolver<Scalar>` -- Eigendecomposition (cuSOLVER)
|
||||||
|
|
||||||
|
Symmetric/Hermitian eigenvalue decomposition via `cusolverDnXsyevd`.
|
||||||
|
`ComputeMode` enum: `EigenvaluesOnly`, `ComputeEigenvectors`.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuSelfAdjointEigenSolver() // Default construct, then call compute()
|
||||||
|
GpuSelfAdjointEigenSolver(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors) // Convenience
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver& compute(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors)
|
||||||
|
GpuSelfAdjointEigenSolver& compute(const DeviceMatrix& d_A, ComputeMode mode = ComputeEigenvectors)
|
||||||
|
|
||||||
|
RealVector eigenvalues() // -> host vector (syncs, downloads, ascending order)
|
||||||
|
PlainMatrix eigenvectors() // -> host Matrix (syncs, downloads, columns)
|
||||||
|
|
||||||
|
ComputationInfo info() // Lazy sync
|
||||||
|
Index rows() / cols()
|
||||||
|
cudaStream_t stream()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:** `eigenvalues()` and `eigenvectors()` download to host on each call.
|
||||||
|
Device-side accessors returning `DeviceMatrix` are planned but not yet
|
||||||
|
implemented.
|
||||||
|
|
||||||
|
### `HostTransfer<Scalar>`
|
||||||
|
|
||||||
|
Future for async device-to-host transfer. Returned by
|
||||||
|
`DeviceMatrix::toHostAsync()`.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
PlainMatrix& get() // Block until complete, return host Matrix ref. Idempotent.
|
||||||
|
bool ready() // Non-blocking poll
|
||||||
|
```
|
||||||
|
|
||||||
|
### `GpuSparseLLT<Scalar, UpLo>` -- Sparse Cholesky (cuDSS)
|
||||||
|
|
||||||
|
Requires cuDSS (CUDA 12.0+, `#define EIGEN_CUDSS`). Three-phase workflow
|
||||||
|
with symbolic reuse. Accepts `SparseMatrix<Scalar, ColMajor, int>` (CSC).
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuSparseLLT() // Default construct
|
||||||
|
GpuSparseLLT(const SparseMatrixBase<D>& A) // Analyze + factorize
|
||||||
|
|
||||||
|
GpuSparseLLT& analyzePattern(const SparseMatrixBase<D>& A) // Symbolic analysis (reusable)
|
||||||
|
GpuSparseLLT& factorize(const SparseMatrixBase<D>& A) // Numeric factorization
|
||||||
|
GpuSparseLLT& compute(const SparseMatrixBase<D>& A) // analyzePattern + factorize
|
||||||
|
void setOrdering(GpuSparseOrdering ord) // AMD (default), METIS, or RCM
|
||||||
|
|
||||||
|
DenseMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
|
||||||
|
|
||||||
|
ComputationInfo info() // Lazy sync
|
||||||
|
Index rows() / cols()
|
||||||
|
cudaStream_t stream()
|
||||||
|
```
|
||||||
|
|
||||||
|
### `GpuSparseLDLT<Scalar, UpLo>` -- Sparse LDL^T (cuDSS)
|
||||||
|
|
||||||
|
Symmetric indefinite. Same API as `GpuSparseLLT`.
|
||||||
|
|
||||||
|
### `GpuSparseLU<Scalar>` -- Sparse LU (cuDSS)
|
||||||
|
|
||||||
|
General non-symmetric. Same API as `GpuSparseLLT` (without `UpLo`).
|
||||||
|
|
||||||
|
### `GpuFFT<Scalar>` -- FFT (cuFFT)
|
||||||
|
|
||||||
|
Plans cached by (size, type) and reused. Inverse transforms scaled so
|
||||||
|
`inv(fwd(x)) == x`. Supported scalars: `float`, `double`.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
// 1D transforms (host vectors in and out)
|
||||||
|
ComplexVector fwd(const MatrixBase<D>& x) // C2C forward (complex input)
|
||||||
|
ComplexVector fwd(const MatrixBase<D>& x) // R2C forward (real input, returns n/2+1)
|
||||||
|
ComplexVector inv(const MatrixBase<D>& X) // C2C inverse, scaled by 1/n
|
||||||
|
RealVector invReal(const MatrixBase<D>& X, Index n) // C2R inverse, scaled by 1/n
|
||||||
|
|
||||||
|
// 2D transforms (host matrices in and out)
|
||||||
|
ComplexMatrix fwd2d(const MatrixBase<D>& A) // 2D C2C forward
|
||||||
|
ComplexMatrix inv2d(const MatrixBase<D>& A) // 2D C2C inverse, scaled by 1/(rows*cols)
|
||||||
|
|
||||||
|
cudaStream_t stream()
|
||||||
|
```
|
||||||
|
|
||||||
|
All FFT methods accept host data and return host data. Upload/download is
|
||||||
|
handled internally. The C2C and R2C overloads of `fwd()` are distinguished by
|
||||||
|
the input scalar type (complex vs real).
|
||||||
|
|
||||||
|
### `GpuSparseContext<Scalar>` -- SpMV/SpMM (cuSPARSE)
|
||||||
|
|
||||||
|
Accepts `SparseMatrix<Scalar, ColMajor>`. All methods accept host data and
|
||||||
|
return host data.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuSparseContext() // Creates own stream + cuSPARSE handle
|
||||||
|
|
||||||
|
DenseVector multiply(A, x) // y = A * x
|
||||||
|
void multiply(A, x, y, alpha=1, beta=0, // y = alpha*op(A)*x + beta*y
|
||||||
|
op=CUSPARSE_OPERATION_NON_TRANSPOSE)
|
||||||
|
DenseVector multiplyT(A, x) // y = A^T * x
|
||||||
|
DenseMatrix multiplyMat(A, X) // Y = A * X (SpMM)
|
||||||
|
|
||||||
|
cudaStream_t stream()
|
||||||
|
```
|
||||||
|
|
||||||
### Aliasing
|
### Aliasing
|
||||||
|
|
||||||
@@ -303,6 +582,18 @@ The caller must ensure operands don't alias the destination for GEMM and TRSM
|
|||||||
| `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
|
| `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
|
||||||
| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
|
| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
|
||||||
| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
|
| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
|
||||||
|
| `GpuQR.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense QR decomposition |
|
||||||
|
| `GpuSVD.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense SVD decomposition |
|
||||||
|
| `GpuEigenSolver.h` | `CuSolverSupport.h` | Self-adjoint eigenvalue decomposition |
|
||||||
|
| `CuFftSupport.h` | `GpuSupport.h`, `<cufft.h>` | cuFFT error macro, type-dispatch wrappers |
|
||||||
|
| `GpuFFT.h` | `CuFftSupport.h`, `CuBlasSupport.h` | 1D/2D FFT with plan caching |
|
||||||
|
| `CuSparseSupport.h` | `GpuSupport.h`, `<cusparse.h>` | cuSPARSE error macro |
|
||||||
|
| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE |
|
||||||
|
| `CuDssSupport.h` | `GpuSupport.h`, `<cudss.h>` | cuDSS error macro, type traits (optional) |
|
||||||
|
| `GpuSparseSolverBase.h` | `CuDssSupport.h` | CRTP base for sparse solvers (optional) |
|
||||||
|
| `GpuSparseLLT.h` | `GpuSparseSolverBase.h` | Sparse Cholesky via cuDSS (optional) |
|
||||||
|
| `GpuSparseLDLT.h` | `GpuSparseSolverBase.h` | Sparse LDL^T via cuDSS (optional) |
|
||||||
|
| `GpuSparseLU.h` | `GpuSparseSolverBase.h` | Sparse LU via cuDSS (optional) |
|
||||||
|
|
||||||
## Building and testing
|
## Building and testing
|
||||||
|
|
||||||
@@ -313,6 +604,33 @@ cmake -G Ninja -B build -S . \
|
|||||||
-DEIGEN_TEST_CUBLAS=ON \
|
-DEIGEN_TEST_CUBLAS=ON \
|
||||||
-DEIGEN_TEST_CUSOLVER=ON
|
-DEIGEN_TEST_CUSOLVER=ON
|
||||||
|
|
||||||
cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu gpu_device_matrix
|
cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu \
|
||||||
ctest --test-dir build -R "gpu_cublas|gpu_cusolver|gpu_device" --output-on-failure
|
gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen \
|
||||||
|
gpu_device_matrix gpu_cufft gpu_cusparse_spmv
|
||||||
|
ctest --test-dir build -R "gpu_" --output-on-failure
|
||||||
|
|
||||||
|
# Sparse solvers (cuDSS -- separate install required)
|
||||||
|
cmake -G Ninja -B build -S . \
|
||||||
|
-DEIGEN_TEST_CUDA=ON \
|
||||||
|
-DEIGEN_CUDA_COMPUTE_ARCH="70" \
|
||||||
|
-DEIGEN_TEST_CUDSS=ON
|
||||||
|
|
||||||
|
cmake --build build --target gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu
|
||||||
|
ctest --test-dir build -R gpu_cudss --output-on-failure
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Future work
|
||||||
|
|
||||||
|
- **Device-side accessors for decomposition results.** `GpuSVD`,
|
||||||
|
`GpuSelfAdjointEigenSolver`, and `GpuQR` currently download decomposition
|
||||||
|
results to host on access (e.g., `svd.matrixU()` returns a host `MatrixXd`).
|
||||||
|
Device-side accessors returning `DeviceMatrix` views of the internal buffers
|
||||||
|
would allow chaining GPU operations (e.g., `svd.deviceU() * d_A`) without
|
||||||
|
round-tripping through host memory.
|
||||||
|
- **Device-resident sparse matrix-vector products.** `GpuSparseContext`
|
||||||
|
currently operates on host vectors and matrices, uploading and downloading
|
||||||
|
on each call. The key missing piece is a `DeviceSparseView` that holds a
|
||||||
|
sparse matrix on device and supports operator syntax (`d_y = d_A * d_x`)
|
||||||
|
with `DeviceMatrix` operands -- keeping the entire SpMV/SpMM pipeline on
|
||||||
|
device. This is essential for iterative solvers and any workflow that chains
|
||||||
|
sparse and dense operations without returning to the host.
|
||||||
|
|||||||
@@ -51,3 +51,7 @@ eigen_add_gpu_benchmark(bench_gpu_chaining_float bench_gpu_chaining.cpp DEFINITI
|
|||||||
# Batching benchmarks: multi-stream concurrency for many small systems.
|
# Batching benchmarks: multi-stream concurrency for many small systems.
|
||||||
eigen_add_gpu_benchmark(bench_gpu_batching bench_gpu_batching.cpp)
|
eigen_add_gpu_benchmark(bench_gpu_batching bench_gpu_batching.cpp)
|
||||||
eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITIONS SCALAR=float)
|
eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITIONS SCALAR=float)
|
||||||
|
|
||||||
|
# FFT benchmarks: 1D/2D C2C, R2C, C2R throughput and plan reuse.
|
||||||
|
eigen_add_gpu_benchmark(bench_gpu_fft bench_gpu_fft.cpp LIBRARIES CUDA::cufft)
|
||||||
|
eigen_add_gpu_benchmark(bench_gpu_fft_double bench_gpu_fft.cpp LIBRARIES CUDA::cufft DEFINITIONS SCALAR=double)
|
||||||
|
|||||||
185
benchmarks/GPU/bench_gpu_fft.cpp
Normal file
185
benchmarks/GPU/bench_gpu_fft.cpp
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
// GPU FFT benchmarks: GpuFFT 1D and 2D throughput.
|
||||||
|
//
|
||||||
|
// Measures forward and inverse FFT performance across a range of sizes,
|
||||||
|
// including plan-amortized (reuse) and cold-start (new plan) scenarios.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// cmake --build build-bench-gpu --target bench_gpu_fft
|
||||||
|
// ./build-bench-gpu/bench_gpu_fft
|
||||||
|
//
|
||||||
|
// Profiling:
|
||||||
|
// nsys profile --trace=cuda ./build-bench-gpu/bench_gpu_fft
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
#ifndef SCALAR
|
||||||
|
#define SCALAR float
|
||||||
|
#endif
|
||||||
|
|
||||||
|
using Scalar = SCALAR;
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using CVec = Matrix<Complex, Dynamic, 1>;
|
||||||
|
using RVec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using CMat = Matrix<Complex, Dynamic, Dynamic>;
|
||||||
|
|
||||||
|
// CUDA warm-up: ensure the GPU is initialized before timing.
|
||||||
|
static void cuda_warmup() {
|
||||||
|
static bool done = false;
|
||||||
|
if (!done) {
|
||||||
|
void* p;
|
||||||
|
cudaMalloc(&p, 1);
|
||||||
|
cudaFree(p);
|
||||||
|
done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// 1D C2C Forward
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
static void BM_GpuFFT_1D_C2C_Fwd(benchmark::State& state) {
|
||||||
|
cuda_warmup();
|
||||||
|
const Index n = state.range(0);
|
||||||
|
CVec x = CVec::Random(n);
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
|
||||||
|
// Warm up plan.
|
||||||
|
CVec tmp = fft.fwd(x);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
benchmark::DoNotOptimize(fft.fwd(x));
|
||||||
|
}
|
||||||
|
state.SetItemsProcessed(state.iterations() * n);
|
||||||
|
state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2); // read + write
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_GpuFFT_1D_C2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// 1D C2C Inverse
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
static void BM_GpuFFT_1D_C2C_Inv(benchmark::State& state) {
|
||||||
|
cuda_warmup();
|
||||||
|
const Index n = state.range(0);
|
||||||
|
CVec x = CVec::Random(n);
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
CVec X = fft.fwd(x);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
benchmark::DoNotOptimize(fft.inv(X));
|
||||||
|
}
|
||||||
|
state.SetItemsProcessed(state.iterations() * n);
|
||||||
|
state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_GpuFFT_1D_C2C_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// 1D R2C Forward
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
static void BM_GpuFFT_1D_R2C_Fwd(benchmark::State& state) {
|
||||||
|
cuda_warmup();
|
||||||
|
const Index n = state.range(0);
|
||||||
|
RVec r = RVec::Random(n);
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
|
||||||
|
// Warm up plan.
|
||||||
|
CVec tmp = fft.fwd(r);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
benchmark::DoNotOptimize(fft.fwd(r));
|
||||||
|
}
|
||||||
|
state.SetItemsProcessed(state.iterations() * n);
|
||||||
|
state.SetBytesProcessed(state.iterations() * (n * sizeof(Scalar) + (n / 2 + 1) * sizeof(Complex)));
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_GpuFFT_1D_R2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// 1D C2R Inverse
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
static void BM_GpuFFT_1D_C2R_Inv(benchmark::State& state) {
|
||||||
|
cuda_warmup();
|
||||||
|
const Index n = state.range(0);
|
||||||
|
RVec r = RVec::Random(n);
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
CVec R = fft.fwd(r);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
benchmark::DoNotOptimize(fft.invReal(R, n));
|
||||||
|
}
|
||||||
|
state.SetItemsProcessed(state.iterations() * n);
|
||||||
|
state.SetBytesProcessed(state.iterations() * ((n / 2 + 1) * sizeof(Complex) + n * sizeof(Scalar)));
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_GpuFFT_1D_C2R_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// 2D C2C Forward
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
static void BM_GpuFFT_2D_C2C_Fwd(benchmark::State& state) {
|
||||||
|
cuda_warmup();
|
||||||
|
const Index n = state.range(0); // square n x n
|
||||||
|
CMat A = CMat::Random(n, n);
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
|
||||||
|
// Warm up plan.
|
||||||
|
CMat tmp = fft.fwd2d(A);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
benchmark::DoNotOptimize(fft.fwd2d(A));
|
||||||
|
}
|
||||||
|
state.SetItemsProcessed(state.iterations() * n * n);
|
||||||
|
state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_GpuFFT_2D_C2C_Fwd)->RangeMultiplier(2)->Range(64, 4096);
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// 2D C2C Roundtrip (fwd + inv)
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
static void BM_GpuFFT_2D_C2C_Roundtrip(benchmark::State& state) {
|
||||||
|
cuda_warmup();
|
||||||
|
const Index n = state.range(0);
|
||||||
|
CMat A = CMat::Random(n, n);
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
|
||||||
|
// Warm up plans.
|
||||||
|
CMat tmp = fft.inv2d(fft.fwd2d(A));
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
CMat B = fft.fwd2d(A);
|
||||||
|
benchmark::DoNotOptimize(fft.inv2d(B));
|
||||||
|
}
|
||||||
|
state.SetItemsProcessed(state.iterations() * n * n * 2); // fwd + inv
|
||||||
|
state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_GpuFFT_2D_C2C_Roundtrip)->RangeMultiplier(2)->Range(64, 4096);
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// 1D Cold start (includes plan creation)
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
static void BM_GpuFFT_1D_ColdStart(benchmark::State& state) {
|
||||||
|
cuda_warmup();
|
||||||
|
const Index n = state.range(0);
|
||||||
|
CVec x = CVec::Random(n);
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
GpuFFT<Scalar> fft; // new object = new plans
|
||||||
|
benchmark::DoNotOptimize(fft.fwd(x));
|
||||||
|
}
|
||||||
|
state.SetItemsProcessed(state.iterations() * n);
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_GpuFFT_1D_ColdStart)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
|
||||||
@@ -528,7 +528,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
|
|||||||
# compiler and linked against CUDA runtime + cuSOLVER. This avoids NVCC
|
# compiler and linked against CUDA runtime + cuSOLVER. This avoids NVCC
|
||||||
# instantiating Eigen's CPU packet operations for CUDA vector types.
|
# instantiating Eigen's CPU packet operations for CUDA vector types.
|
||||||
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||||
foreach(_cusolver_test IN ITEMS gpu_cusolver_llt gpu_cusolver_lu)
|
foreach(_cusolver_test IN ITEMS gpu_cusolver_llt gpu_cusolver_lu gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen)
|
||||||
add_executable(${_cusolver_test} ${_cusolver_test}.cpp)
|
add_executable(${_cusolver_test} ${_cusolver_test}.cpp)
|
||||||
target_include_directories(${_cusolver_test} PRIVATE
|
target_include_directories(${_cusolver_test} PRIVATE
|
||||||
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||||
@@ -547,11 +547,86 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
|
|||||||
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# cuFFT test (cuFFT is part of the CUDA toolkit — no separate option needed).
|
||||||
|
if(TARGET CUDA::cufft)
|
||||||
|
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||||
|
add_executable(gpu_cufft gpu_cufft.cpp)
|
||||||
|
target_include_directories(gpu_cufft PRIVATE
|
||||||
|
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}")
|
||||||
|
target_link_libraries(gpu_cufft
|
||||||
|
Eigen3::Eigen CUDA::cudart CUDA::cufft CUDA::cublas)
|
||||||
|
target_compile_definitions(gpu_cufft PRIVATE
|
||||||
|
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
|
||||||
|
EIGEN_TEST_PART_ALL=1)
|
||||||
|
add_test(NAME gpu_cufft COMMAND gpu_cufft)
|
||||||
|
add_dependencies(buildtests gpu_cufft)
|
||||||
|
add_dependencies(buildtests_gpu gpu_cufft)
|
||||||
|
set_property(TEST gpu_cufft APPEND PROPERTY LABELS "Official;gpu")
|
||||||
|
set_property(TEST gpu_cufft PROPERTY SKIP_RETURN_CODE 77)
|
||||||
|
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# cuSPARSE SpMV test (cuSPARSE is part of the CUDA toolkit).
|
||||||
|
if(TARGET CUDA::cusparse)
|
||||||
|
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||||
|
add_executable(gpu_cusparse_spmv gpu_cusparse_spmv.cpp)
|
||||||
|
target_include_directories(gpu_cusparse_spmv PRIVATE
|
||||||
|
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}")
|
||||||
|
target_link_libraries(gpu_cusparse_spmv
|
||||||
|
Eigen3::Eigen CUDA::cudart CUDA::cusparse)
|
||||||
|
target_compile_definitions(gpu_cusparse_spmv PRIVATE
|
||||||
|
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
|
||||||
|
EIGEN_TEST_PART_ALL=1)
|
||||||
|
add_test(NAME gpu_cusparse_spmv COMMAND gpu_cusparse_spmv)
|
||||||
|
add_dependencies(buildtests gpu_cusparse_spmv)
|
||||||
|
add_dependencies(buildtests_gpu gpu_cusparse_spmv)
|
||||||
|
set_property(TEST gpu_cusparse_spmv APPEND PROPERTY LABELS "Official;gpu")
|
||||||
|
set_property(TEST gpu_cusparse_spmv PROPERTY SKIP_RETURN_CODE 77)
|
||||||
|
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||||
|
endif()
|
||||||
|
|
||||||
option(EIGEN_TEST_CUSPARSE "Test cuSPARSE integration" OFF)
|
option(EIGEN_TEST_CUSPARSE "Test cuSPARSE integration" OFF)
|
||||||
if(EIGEN_TEST_CUSPARSE AND TARGET CUDA::cusparse)
|
if(EIGEN_TEST_CUSPARSE AND TARGET CUDA::cusparse)
|
||||||
ei_add_test(gpu_cusparse "" "CUDA::cusparse")
|
ei_add_test(gpu_cusparse "" "CUDA::cusparse")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
# cuDSS sparse direct solver tests.
|
||||||
|
# cuDSS is distributed separately from the CUDA Toolkit.
|
||||||
|
option(EIGEN_TEST_CUDSS "Test cuDSS sparse solver integration" OFF)
|
||||||
|
if(EIGEN_TEST_CUDSS)
|
||||||
|
find_path(CUDSS_INCLUDE_DIR cudss.h
|
||||||
|
HINTS ${CUDSS_DIR}/include ${CUDA_TOOLKIT_ROOT_DIR}/include /usr/include)
|
||||||
|
find_library(CUDSS_LIBRARY cudss
|
||||||
|
HINTS ${CUDSS_DIR}/lib ${CUDSS_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib/x86_64-linux-gnu)
|
||||||
|
if(CUDSS_INCLUDE_DIR AND CUDSS_LIBRARY)
|
||||||
|
message(STATUS "cuDSS found: ${CUDSS_LIBRARY}")
|
||||||
|
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||||
|
foreach(_cudss_test IN ITEMS gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu)
|
||||||
|
add_executable(${_cudss_test} ${_cudss_test}.cpp)
|
||||||
|
target_include_directories(${_cudss_test} PRIVATE
|
||||||
|
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||||
|
"${CUDSS_INCLUDE_DIR}"
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}")
|
||||||
|
target_link_libraries(${_cudss_test}
|
||||||
|
Eigen3::Eigen CUDA::cudart CUDA::cusolver CUDA::cublas ${CUDSS_LIBRARY})
|
||||||
|
target_compile_definitions(${_cudss_test} PRIVATE
|
||||||
|
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
|
||||||
|
EIGEN_TEST_PART_ALL=1
|
||||||
|
EIGEN_CUDSS=1)
|
||||||
|
add_test(NAME ${_cudss_test} COMMAND "${_cudss_test}")
|
||||||
|
add_dependencies(buildtests ${_cudss_test})
|
||||||
|
add_dependencies(buildtests_gpu ${_cudss_test})
|
||||||
|
set_property(TEST ${_cudss_test} APPEND PROPERTY LABELS "Official;gpu")
|
||||||
|
set_property(TEST ${_cudss_test} PROPERTY SKIP_RETURN_CODE 77)
|
||||||
|
endforeach()
|
||||||
|
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||||
|
else()
|
||||||
|
message(WARNING "EIGEN_TEST_CUDSS=ON but cuDSS not found. Set CUDSS_DIR.")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||||
|
|
||||||
endif()
|
endif()
|
||||||
|
|||||||
@@ -16,6 +16,32 @@
|
|||||||
|
|
||||||
using namespace Eigen;
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// Unit roundoff for GPU GEMM compute precision.
|
||||||
|
// TF32 (opt-in via EIGEN_CUDA_TF32) has eps ~ 2^{-10}.
|
||||||
|
template <typename Scalar>
|
||||||
|
typename NumTraits<Scalar>::Real gpu_unit_roundoff() {
|
||||||
|
#if defined(EIGEN_CUDA_TF32) && !defined(EIGEN_NO_CUDA_TENSOR_OPS)
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
if (std::is_same<RealScalar, float>::value) return RealScalar(9.8e-4);
|
||||||
|
#endif
|
||||||
|
return NumTraits<Scalar>::epsilon();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Higham-Mary probabilistic error bound for GEMM:
|
||||||
|
// ||C - fl(C)||_F <= lambda * sqrt(k) * u * ||A||_F * ||B||_F
|
||||||
|
// where k is the inner dimension, u is the unit roundoff, and
|
||||||
|
// lambda = sqrt(2 * ln(2/delta)) with delta = failure probability.
|
||||||
|
// lambda = 5 corresponds to delta ~ 10^{-6}.
|
||||||
|
// Reference: Higham & Mary, "Probabilistic Error Analysis for Inner Products",
|
||||||
|
// SIAM J. Matrix Anal. Appl., 2019.
|
||||||
|
template <typename Scalar>
|
||||||
|
typename NumTraits<Scalar>::Real gemm_error_bound(Index k, typename NumTraits<Scalar>::Real normA,
|
||||||
|
typename NumTraits<Scalar>::Real normB) {
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
constexpr RealScalar lambda = 5;
|
||||||
|
return lambda * std::sqrt(static_cast<RealScalar>(k)) * gpu_unit_roundoff<Scalar>() * normA * normB;
|
||||||
|
}
|
||||||
|
|
||||||
// ---- Basic GEMM: C = A * B -------------------------------------------------
|
// ---- Basic GEMM: C = A * B -------------------------------------------------
|
||||||
|
|
||||||
template <typename Scalar>
|
template <typename Scalar>
|
||||||
@@ -36,7 +62,7 @@ void test_gemm_basic(Index m, Index n, Index k) {
|
|||||||
Mat C = d_C.toHost();
|
Mat C = d_C.toHost();
|
||||||
Mat C_ref = A * B;
|
Mat C_ref = A * B;
|
||||||
|
|
||||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
VERIFY((C - C_ref).norm() < tol);
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -59,7 +85,7 @@ void test_gemm_adjoint_lhs(Index m, Index n, Index k) {
|
|||||||
Mat C = d_C.toHost();
|
Mat C = d_C.toHost();
|
||||||
Mat C_ref = A.adjoint() * B;
|
Mat C_ref = A.adjoint() * B;
|
||||||
|
|
||||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
VERIFY((C - C_ref).norm() < tol);
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -82,7 +108,7 @@ void test_gemm_transpose_rhs(Index m, Index n, Index k) {
|
|||||||
Mat C = d_C.toHost();
|
Mat C = d_C.toHost();
|
||||||
Mat C_ref = A * B.transpose();
|
Mat C_ref = A * B.transpose();
|
||||||
|
|
||||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
VERIFY((C - C_ref).norm() < tol);
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -106,7 +132,7 @@ void test_gemm_scaled(Index m, Index n, Index k) {
|
|||||||
Mat C = d_C.toHost();
|
Mat C = d_C.toHost();
|
||||||
Mat C_ref = alpha * A * B;
|
Mat C_ref = alpha * A * B;
|
||||||
|
|
||||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
VERIFY((C - C_ref).norm() < tol);
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -130,7 +156,7 @@ void test_gemm_accumulate(Index m, Index n, Index k) {
|
|||||||
Mat C = d_C.toHost();
|
Mat C = d_C.toHost();
|
||||||
Mat C_ref = C_init + A * B;
|
Mat C_ref = C_init + A * B;
|
||||||
|
|
||||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
VERIFY((C - C_ref).norm() < tol);
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -153,7 +179,7 @@ void test_gemm_accumulate_empty(Index m, Index n, Index k) {
|
|||||||
Mat C = d_C.toHost();
|
Mat C = d_C.toHost();
|
||||||
Mat C_ref = A * B;
|
Mat C_ref = A * B;
|
||||||
|
|
||||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
VERIFY((C - C_ref).norm() < tol);
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -178,7 +204,7 @@ void test_gemm_subtract(Index m, Index n, Index k) {
|
|||||||
Mat C = d_C.toHost();
|
Mat C = d_C.toHost();
|
||||||
Mat C_ref = C_init - A * B;
|
Mat C_ref = C_init - A * B;
|
||||||
|
|
||||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
VERIFY((C - C_ref).norm() < tol);
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -202,7 +228,7 @@ void test_gemm_subtract_empty(Index m, Index n, Index k) {
|
|||||||
Mat C = d_C.toHost();
|
Mat C = d_C.toHost();
|
||||||
Mat C_ref = -(A * B);
|
Mat C_ref = -(A * B);
|
||||||
|
|
||||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
VERIFY((C - C_ref).norm() < tol);
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -226,7 +252,7 @@ void test_gemm_scaled_rhs(Index m, Index n, Index k) {
|
|||||||
Mat C = d_C.toHost();
|
Mat C = d_C.toHost();
|
||||||
Mat C_ref = A * (alpha * B);
|
Mat C_ref = A * (alpha * B);
|
||||||
|
|
||||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
VERIFY((C - C_ref).norm() < tol);
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -266,7 +292,7 @@ void test_gemm_explicit_context(Index m, Index n, Index k) {
|
|||||||
Mat C = d_C.toHost();
|
Mat C = d_C.toHost();
|
||||||
Mat C_ref = A * B;
|
Mat C_ref = A * B;
|
||||||
|
|
||||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
VERIFY((C - C_ref).norm() < tol);
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -296,7 +322,7 @@ void test_gemm_cross_context_reuse(Index n) {
|
|||||||
Mat C = d_C.toHost();
|
Mat C = d_C.toHost();
|
||||||
Mat C_ref = A * B + D * E;
|
Mat C_ref = A * B + D * E;
|
||||||
|
|
||||||
RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm()) + gemm_error_bound<Scalar>(n, D.norm(), E.norm());
|
||||||
VERIFY((C - C_ref).norm() < tol);
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -326,7 +352,7 @@ void test_gemm_cross_context_resize() {
|
|||||||
Mat C = d_C.toHost();
|
Mat C = d_C.toHost();
|
||||||
Mat C_ref = D * E;
|
Mat C_ref = D * E;
|
||||||
|
|
||||||
RealScalar tol = RealScalar(16) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
RealScalar tol = gemm_error_bound<Scalar>(16, D.norm(), E.norm());
|
||||||
VERIFY((C - C_ref).norm() < tol);
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -353,7 +379,9 @@ void test_gemm_chain(Index n) {
|
|||||||
Mat D = d_D.toHost();
|
Mat D = d_D.toHost();
|
||||||
Mat D_ref = (A * B) * E;
|
Mat D_ref = (A * B) * E;
|
||||||
|
|
||||||
RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * D_ref.norm();
|
Mat C_ref = A * B;
|
||||||
|
RealScalar tol =
|
||||||
|
gemm_error_bound<Scalar>(n, A.norm(), B.norm()) * E.norm() + gemm_error_bound<Scalar>(n, C_ref.norm(), E.norm());
|
||||||
VERIFY((D - D_ref).norm() < tol);
|
VERIFY((D - D_ref).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -401,7 +429,7 @@ void test_llt_solve_expr(Index n, Index nrhs) {
|
|||||||
|
|
||||||
Mat X = d_X.toHost();
|
Mat X = d_X.toHost();
|
||||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- LLT solve with explicit context ----------------------------------------
|
// ---- LLT solve with explicit context ----------------------------------------
|
||||||
@@ -423,7 +451,7 @@ void test_llt_solve_expr_context(Index n, Index nrhs) {
|
|||||||
|
|
||||||
Mat X = d_X.toHost();
|
Mat X = d_X.toHost();
|
||||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------
|
// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------
|
||||||
@@ -444,7 +472,7 @@ void test_lu_solve_expr(Index n, Index nrhs) {
|
|||||||
|
|
||||||
Mat X = d_X.toHost();
|
Mat X = d_X.toHost();
|
||||||
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||||
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------
|
// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------
|
||||||
@@ -474,7 +502,7 @@ void test_gemm_then_solve(Index n) {
|
|||||||
|
|
||||||
Mat X = d_X.toHost();
|
Mat X = d_X.toHost();
|
||||||
RealScalar residual = (C * X - D).norm() / D.norm();
|
RealScalar residual = (C * X - D).norm() / D.norm();
|
||||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- LLT solve with Upper triangle -----------------------------------------
|
// ---- LLT solve with Upper triangle -----------------------------------------
|
||||||
@@ -495,7 +523,7 @@ void test_llt_solve_upper(Index n, Index nrhs) {
|
|||||||
|
|
||||||
Mat X = d_X.toHost();
|
Mat X = d_X.toHost();
|
||||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- LU solve with explicit context -----------------------------------------
|
// ---- LU solve with explicit context -----------------------------------------
|
||||||
@@ -517,7 +545,7 @@ void test_lu_solve_expr_context(Index n, Index nrhs) {
|
|||||||
|
|
||||||
Mat X = d_X.toHost();
|
Mat X = d_X.toHost();
|
||||||
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||||
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- Zero-nrhs solver expressions ------------------------------------------
|
// ---- Zero-nrhs solver expressions ------------------------------------------
|
||||||
@@ -581,7 +609,7 @@ void test_trsm(Index n, Index nrhs) {
|
|||||||
|
|
||||||
Mat X = d_X.toHost();
|
Mat X = d_X.toHost();
|
||||||
RealScalar residual = (A * X - B).norm() / B.norm();
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||||
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- SYMM/HEMM: selfadjointView<UpLo>() * B --------------------------------
|
// ---- SYMM/HEMM: selfadjointView<UpLo>() * B --------------------------------
|
||||||
@@ -603,7 +631,7 @@ void test_symm(Index n, Index nrhs) {
|
|||||||
Mat C = d_C.toHost();
|
Mat C = d_C.toHost();
|
||||||
Mat C_ref = A * B; // A is symmetric, so full multiply == symm
|
Mat C_ref = A * B; // A is symmetric, so full multiply == symm
|
||||||
|
|
||||||
RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm());
|
||||||
VERIFY((C - C_ref).norm() < tol);
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -629,7 +657,7 @@ void test_syrk(Index n, Index k) {
|
|||||||
Mat C_lower = C.template triangularView<Lower>();
|
Mat C_lower = C.template triangularView<Lower>();
|
||||||
Mat C_ref_lower = C_ref.template triangularView<Lower>();
|
Mat C_ref_lower = C_ref.template triangularView<Lower>();
|
||||||
|
|
||||||
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), A.norm());
|
||||||
VERIFY((C_lower - C_ref_lower).norm() < tol);
|
VERIFY((C_lower - C_ref_lower).norm() < tol);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
154
test/gpu_cudss_ldlt.cpp
Normal file
154
test/gpu_cudss_ldlt.cpp
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuSparseLDLT: GPU sparse LDL^T via cuDSS.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Helper: build a random sparse symmetric indefinite matrix ---------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
SparseMatrix<Scalar, ColMajor, int> make_symmetric_indefinite(Index n, double density = 0.1) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
|
||||||
|
// Build a random sparse matrix and symmetrize it.
|
||||||
|
// The diagonal has mixed signs to ensure indefiniteness.
|
||||||
|
SpMat R(n, n);
|
||||||
|
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
|
||||||
|
for (Index j = 0; j < n; ++j) {
|
||||||
|
for (Index i = 0; i < n; ++i) {
|
||||||
|
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
|
||||||
|
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
R.makeCompressed();
|
||||||
|
|
||||||
|
// A = R + R^H (symmetric), then add diagonal with alternating signs for indefiniteness.
|
||||||
|
SpMat A = R + SparseMatrix<Scalar, ColMajor, int>(R.adjoint());
|
||||||
|
for (Index i = 0; i < n; ++i) {
|
||||||
|
Scalar diag_val = Scalar((i % 2 == 0) ? n : -n);
|
||||||
|
A.coeffRef(i, i) += diag_val;
|
||||||
|
}
|
||||||
|
A.makeCompressed();
|
||||||
|
return A;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve and check residual -----------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_solve(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_symmetric_indefinite<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLDLT<Scalar> ldlt(A);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.info(), Success);
|
||||||
|
|
||||||
|
Vec x = ldlt.solve(b);
|
||||||
|
VERIFY_IS_EQUAL(x.rows(), n);
|
||||||
|
|
||||||
|
Vec r = A * x - b;
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(r.norm() / b.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Multiple RHS -----------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_multiple_rhs(Index n, Index nrhs) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_symmetric_indefinite<Scalar>(n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
GpuSparseLDLT<Scalar> ldlt(A);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.info(), Success);
|
||||||
|
|
||||||
|
Mat X = ldlt.solve(B);
|
||||||
|
VERIFY_IS_EQUAL(X.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(X.cols(), nrhs);
|
||||||
|
|
||||||
|
Mat R = A * X - B;
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(R.norm() / B.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Refactorize ------------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_refactorize(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_symmetric_indefinite<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLDLT<Scalar> ldlt;
|
||||||
|
ldlt.analyzePattern(A);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.info(), Success);
|
||||||
|
|
||||||
|
ldlt.factorize(A);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.info(), Success);
|
||||||
|
Vec x1 = ldlt.solve(b);
|
||||||
|
|
||||||
|
// Modify values, keep pattern.
|
||||||
|
SpMat A2 = A;
|
||||||
|
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
|
||||||
|
|
||||||
|
ldlt.factorize(A2);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.info(), Success);
|
||||||
|
Vec x2 = ldlt.solve(b);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((A * x1 - b).norm() / b.norm() < tol);
|
||||||
|
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
|
||||||
|
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Empty ------------------------------------------------------------------
|
||||||
|
|
||||||
|
void test_empty() {
|
||||||
|
using SpMat = SparseMatrix<double, ColMajor, int>;
|
||||||
|
SpMat A(0, 0);
|
||||||
|
A.makeCompressed();
|
||||||
|
GpuSparseLDLT<double> ldlt(A);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.info(), Success);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.rows(), 0);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.cols(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST(test_solve<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_solve<Scalar>(256));
|
||||||
|
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_refactorize<Scalar>(64));
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_DECLARE_TEST(gpu_cudss_ldlt) {
|
||||||
|
CALL_SUBTEST(test_scalar<float>());
|
||||||
|
CALL_SUBTEST(test_scalar<double>());
|
||||||
|
CALL_SUBTEST(test_scalar<std::complex<float>>());
|
||||||
|
CALL_SUBTEST(test_scalar<std::complex<double>>());
|
||||||
|
CALL_SUBTEST(test_empty());
|
||||||
|
}
|
||||||
202
test/gpu_cudss_llt.cpp
Normal file
202
test/gpu_cudss_llt.cpp
Normal file
@@ -0,0 +1,202 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuSparseLLT: GPU sparse Cholesky via cuDSS.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Helper: build a random sparse SPD matrix -------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
// Uses the global std::rand state seeded by the test framework (g_seed).
|
||||||
|
SpMat R(n, n);
|
||||||
|
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
|
||||||
|
for (Index j = 0; j < n; ++j) {
|
||||||
|
for (Index i = 0; i < n; ++i) {
|
||||||
|
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
|
||||||
|
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
R.makeCompressed();
|
||||||
|
|
||||||
|
// A = R^H * R + n * I (guaranteed SPD).
|
||||||
|
SpMat A = R.adjoint() * R;
|
||||||
|
for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
|
||||||
|
A.makeCompressed();
|
||||||
|
return A;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve and check residual -----------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_solve(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_spd<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLLT<Scalar> llt(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
|
||||||
|
Vec x = llt.solve(b);
|
||||||
|
VERIFY_IS_EQUAL(x.rows(), n);
|
||||||
|
|
||||||
|
// Check residual: ||Ax - b|| / ||b||.
|
||||||
|
Vec r = A * x - b;
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(r.norm() / b.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Compare with CPU SimplicialLLT -----------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_vs_cpu(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_spd<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLLT<Scalar> gpu_llt(A);
|
||||||
|
VERIFY_IS_EQUAL(gpu_llt.info(), Success);
|
||||||
|
Vec x_gpu = gpu_llt.solve(b);
|
||||||
|
|
||||||
|
SimplicialLLT<SpMat> cpu_llt(A);
|
||||||
|
VERIFY_IS_EQUAL(cpu_llt.info(), Success);
|
||||||
|
Vec x_cpu = cpu_llt.solve(b);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((x_gpu - x_cpu).norm() / x_cpu.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Multiple RHS -----------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_multiple_rhs(Index n, Index nrhs) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_spd<Scalar>(n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
GpuSparseLLT<Scalar> llt(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
|
||||||
|
Mat X = llt.solve(B);
|
||||||
|
VERIFY_IS_EQUAL(X.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(X.cols(), nrhs);
|
||||||
|
|
||||||
|
Mat R = A * X - B;
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(R.norm() / B.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Separate analyze + factorize (refactorization) -------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_refactorize(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_spd<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLLT<Scalar> llt;
|
||||||
|
llt.analyzePattern(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
|
||||||
|
// First factorize + solve.
|
||||||
|
llt.factorize(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
Vec x1 = llt.solve(b);
|
||||||
|
|
||||||
|
// Modify values (keep same pattern): scale diagonal.
|
||||||
|
SpMat A2 = A;
|
||||||
|
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
|
||||||
|
|
||||||
|
// Refactorize with same pattern.
|
||||||
|
llt.factorize(A2);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
Vec x2 = llt.solve(b);
|
||||||
|
|
||||||
|
// Both solutions should satisfy their respective systems.
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((A * x1 - b).norm() / b.norm() < tol);
|
||||||
|
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
|
||||||
|
|
||||||
|
// Solutions should differ (A2 != A).
|
||||||
|
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Empty matrix -----------------------------------------------------------
|
||||||
|
|
||||||
|
void test_empty() {
|
||||||
|
using SpMat = SparseMatrix<double, ColMajor, int>;
|
||||||
|
SpMat A(0, 0);
|
||||||
|
A.makeCompressed();
|
||||||
|
GpuSparseLLT<double> llt(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
VERIFY_IS_EQUAL(llt.rows(), 0);
|
||||||
|
VERIFY_IS_EQUAL(llt.cols(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Upper triangle ---------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_upper(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_spd<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLLT<Scalar, Upper> llt(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
|
||||||
|
Vec x = llt.solve(b);
|
||||||
|
Vec r = A * x - b;
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(r.norm() / b.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST(test_solve<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_solve<Scalar>(256));
|
||||||
|
CALL_SUBTEST(test_vs_cpu<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_refactorize<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_upper<Scalar>(64));
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_DECLARE_TEST(gpu_cudss_llt) {
|
||||||
|
CALL_SUBTEST(test_scalar<float>());
|
||||||
|
CALL_SUBTEST(test_scalar<double>());
|
||||||
|
CALL_SUBTEST(test_scalar<std::complex<float>>());
|
||||||
|
CALL_SUBTEST(test_scalar<std::complex<double>>());
|
||||||
|
CALL_SUBTEST(test_empty());
|
||||||
|
}
|
||||||
147
test/gpu_cudss_lu.cpp
Normal file
147
test/gpu_cudss_lu.cpp
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuSparseLU: GPU sparse LU via cuDSS.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Helper: build a random sparse non-singular general matrix ---------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
SparseMatrix<Scalar, ColMajor, int> make_general(Index n, double density = 0.1) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat R(n, n);
|
||||||
|
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
|
||||||
|
for (Index j = 0; j < n; ++j) {
|
||||||
|
for (Index i = 0; i < n; ++i) {
|
||||||
|
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
|
||||||
|
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Add strong diagonal for non-singularity.
|
||||||
|
for (Index i = 0; i < n; ++i) R.coeffRef(i, i) += Scalar(RealScalar(n));
|
||||||
|
R.makeCompressed();
|
||||||
|
return R;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve and check residual -----------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_solve(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_general<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLU<Scalar> lu(A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
|
||||||
|
Vec x = lu.solve(b);
|
||||||
|
VERIFY_IS_EQUAL(x.rows(), n);
|
||||||
|
|
||||||
|
Vec r = A * x - b;
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(r.norm() / b.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Multiple RHS -----------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_multiple_rhs(Index n, Index nrhs) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_general<Scalar>(n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
GpuSparseLU<Scalar> lu(A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
|
||||||
|
Mat X = lu.solve(B);
|
||||||
|
VERIFY_IS_EQUAL(X.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(X.cols(), nrhs);
|
||||||
|
|
||||||
|
Mat R = A * X - B;
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(R.norm() / B.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Refactorize ------------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_refactorize(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_general<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLU<Scalar> lu;
|
||||||
|
lu.analyzePattern(A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
|
||||||
|
lu.factorize(A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
Vec x1 = lu.solve(b);
|
||||||
|
|
||||||
|
// Modify values, keep pattern.
|
||||||
|
SpMat A2 = A;
|
||||||
|
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
|
||||||
|
|
||||||
|
lu.factorize(A2);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
Vec x2 = lu.solve(b);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((A * x1 - b).norm() / b.norm() < tol);
|
||||||
|
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
|
||||||
|
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Empty ------------------------------------------------------------------
|
||||||
|
|
||||||
|
void test_empty() {
|
||||||
|
using SpMat = SparseMatrix<double, ColMajor, int>;
|
||||||
|
SpMat A(0, 0);
|
||||||
|
A.makeCompressed();
|
||||||
|
GpuSparseLU<double> lu(A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
VERIFY_IS_EQUAL(lu.rows(), 0);
|
||||||
|
VERIFY_IS_EQUAL(lu.cols(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST(test_solve<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_solve<Scalar>(256));
|
||||||
|
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_refactorize<Scalar>(64));
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_DECLARE_TEST(gpu_cudss_lu) {
|
||||||
|
CALL_SUBTEST(test_scalar<float>());
|
||||||
|
CALL_SUBTEST(test_scalar<double>());
|
||||||
|
CALL_SUBTEST(test_scalar<std::complex<float>>());
|
||||||
|
CALL_SUBTEST(test_scalar<std::complex<double>>());
|
||||||
|
CALL_SUBTEST(test_empty());
|
||||||
|
}
|
||||||
186
test/gpu_cufft.cpp
Normal file
186
test/gpu_cufft.cpp
Normal file
@@ -0,0 +1,186 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuFFT: GPU FFT via cuFFT.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- 1D C2C roundtrip: inv(fwd(x)) ≈ x -------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_c2c_roundtrip(Index n) {
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using Vec = Matrix<Complex, Dynamic, 1>;
|
||||||
|
using RealScalar = Scalar;
|
||||||
|
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
Vec X = fft.fwd(x);
|
||||||
|
VERIFY_IS_EQUAL(X.size(), n);
|
||||||
|
|
||||||
|
Vec y = fft.inv(X);
|
||||||
|
VERIFY_IS_EQUAL(y.size(), n);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((y - x).norm() / x.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 1D C2C known signal: FFT of constant = delta --------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_c2c_constant() {
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using Vec = Matrix<Complex, Dynamic, 1>;
|
||||||
|
using RealScalar = Scalar;
|
||||||
|
|
||||||
|
const int n = 64;
|
||||||
|
Vec x = Vec::Constant(n, Complex(3.0, 0.0));
|
||||||
|
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
Vec X = fft.fwd(x);
|
||||||
|
|
||||||
|
// FFT of constant c: X[0] = c*n, X[k] = 0 for k > 0.
|
||||||
|
RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(n);
|
||||||
|
VERIFY(std::abs(X(0) - Complex(3.0 * n, 0.0)) < tol);
|
||||||
|
for (int k = 1; k < n; ++k) {
|
||||||
|
VERIFY(std::abs(X(k)) < tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 1D R2C/C2R roundtrip: invReal(fwd(r), n) ≈ r --------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_r2c_roundtrip(Index n) {
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using CVec = Matrix<Complex, Dynamic, 1>;
|
||||||
|
using RVec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = Scalar;
|
||||||
|
|
||||||
|
RVec r = RVec::Random(n);
|
||||||
|
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
CVec R = fft.fwd(r);
|
||||||
|
|
||||||
|
// R2C returns n/2+1 complex values.
|
||||||
|
VERIFY_IS_EQUAL(R.size(), n / 2 + 1);
|
||||||
|
|
||||||
|
RVec s = fft.invReal(R, n);
|
||||||
|
VERIFY_IS_EQUAL(s.size(), n);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((s - r).norm() / r.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 2D C2C roundtrip: inv2d(fwd2d(A)) ≈ A ---------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_2d_roundtrip(Index rows, Index cols) {
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using Mat = Matrix<Complex, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = Scalar;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(rows, cols);
|
||||||
|
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
Mat B = fft.fwd2d(A);
|
||||||
|
VERIFY_IS_EQUAL(B.rows(), rows);
|
||||||
|
VERIFY_IS_EQUAL(B.cols(), cols);
|
||||||
|
|
||||||
|
Mat C = fft.inv2d(B);
|
||||||
|
VERIFY_IS_EQUAL(C.rows(), rows);
|
||||||
|
VERIFY_IS_EQUAL(C.cols(), cols);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(rows * cols) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((C - A).norm() / A.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 2D C2C known signal: constant matrix -----------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_2d_constant() {
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using Mat = Matrix<Complex, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = Scalar;
|
||||||
|
|
||||||
|
const int rows = 16, cols = 32;
|
||||||
|
Mat A = Mat::Constant(rows, cols, Complex(2.0, 0.0));
|
||||||
|
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
Mat B = fft.fwd2d(A);
|
||||||
|
|
||||||
|
// 2D FFT of constant c: B(0,0) = c*rows*cols, all others = 0.
|
||||||
|
RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(rows * cols);
|
||||||
|
VERIFY(std::abs(B(0, 0) - Complex(2.0 * rows * cols, 0.0)) < tol);
|
||||||
|
for (int j = 0; j < cols; ++j) {
|
||||||
|
for (int i = 0; i < rows; ++i) {
|
||||||
|
if (i == 0 && j == 0) continue;
|
||||||
|
VERIFY(std::abs(B(i, j)) < tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Plan reuse: repeated calls should work ---------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_plan_reuse() {
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using Vec = Matrix<Complex, Dynamic, 1>;
|
||||||
|
using RealScalar = Scalar;
|
||||||
|
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
for (int trial = 0; trial < 5; ++trial) {
|
||||||
|
Vec x = Vec::Random(128);
|
||||||
|
Vec X = fft.fwd(x);
|
||||||
|
Vec y = fft.inv(X);
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(128) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((y - x).norm() / x.norm() < tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Empty ------------------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_empty() {
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using Vec = Matrix<Complex, Dynamic, 1>;
|
||||||
|
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
Vec x(0);
|
||||||
|
Vec X = fft.fwd(x);
|
||||||
|
VERIFY_IS_EQUAL(X.size(), 0);
|
||||||
|
Vec y = fft.inv(X);
|
||||||
|
VERIFY_IS_EQUAL(y.size(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(256));
|
||||||
|
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(1000)); // non-power-of-2
|
||||||
|
CALL_SUBTEST(test_c2c_constant<Scalar>());
|
||||||
|
CALL_SUBTEST(test_r2c_roundtrip<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_r2c_roundtrip<Scalar>(256));
|
||||||
|
CALL_SUBTEST(test_2d_roundtrip<Scalar>(32, 32));
|
||||||
|
CALL_SUBTEST(test_2d_roundtrip<Scalar>(16, 64)); // non-square
|
||||||
|
CALL_SUBTEST(test_2d_constant<Scalar>());
|
||||||
|
CALL_SUBTEST(test_plan_reuse<Scalar>());
|
||||||
|
CALL_SUBTEST(test_empty<Scalar>());
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_DECLARE_TEST(gpu_cufft) {
|
||||||
|
CALL_SUBTEST(test_scalar<float>());
|
||||||
|
CALL_SUBTEST(test_scalar<double>());
|
||||||
|
}
|
||||||
180
test/gpu_cusolver_eigen.cpp
Normal file
180
test/gpu_cusolver_eigen.cpp
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuSelfAdjointEigenSolver: GPU symmetric/Hermitian eigenvalue
|
||||||
|
// decomposition via cuSOLVER.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/Eigenvalues>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Reconstruction: V * diag(W) * V^H ≈ A ---------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_eigen_reconstruction(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
// Build a symmetric/Hermitian matrix.
|
||||||
|
Mat R = Mat::Random(n, n);
|
||||||
|
Mat A = R + R.adjoint();
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver<Scalar> es(A);
|
||||||
|
VERIFY_IS_EQUAL(es.info(), Success);
|
||||||
|
|
||||||
|
auto W = es.eigenvalues();
|
||||||
|
Mat V = es.eigenvectors();
|
||||||
|
|
||||||
|
VERIFY_IS_EQUAL(W.size(), n);
|
||||||
|
VERIFY_IS_EQUAL(V.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(V.cols(), n);
|
||||||
|
|
||||||
|
// Reconstruct: A_hat = V * diag(W) * V^H.
|
||||||
|
Mat A_hat = V * W.asDiagonal() * V.adjoint();
|
||||||
|
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
|
||||||
|
VERIFY((A_hat - A).norm() < tol);
|
||||||
|
|
||||||
|
// Orthogonality: V^H * V ≈ I.
|
||||||
|
Mat VhV = V.adjoint() * V;
|
||||||
|
Mat eye = Mat::Identity(n, n);
|
||||||
|
VERIFY((VhV - eye).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Eigenvalues match CPU SelfAdjointEigenSolver ---------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_eigen_values(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat R = Mat::Random(n, n);
|
||||||
|
Mat A = R + R.adjoint();
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver<Scalar> gpu_es(A);
|
||||||
|
VERIFY_IS_EQUAL(gpu_es.info(), Success);
|
||||||
|
auto W_gpu = gpu_es.eigenvalues();
|
||||||
|
|
||||||
|
SelfAdjointEigenSolver<Mat> cpu_es(A);
|
||||||
|
auto W_cpu = cpu_es.eigenvalues();
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() *
|
||||||
|
W_cpu.cwiseAbs().maxCoeff();
|
||||||
|
VERIFY((W_gpu - W_cpu).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Eigenvalues-only mode --------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_eigen_values_only(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat R = Mat::Random(n, n);
|
||||||
|
Mat A = R + R.adjoint();
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver<Scalar> gpu_es(A, GpuSelfAdjointEigenSolver<Scalar>::EigenvaluesOnly);
|
||||||
|
VERIFY_IS_EQUAL(gpu_es.info(), Success);
|
||||||
|
auto W_gpu = gpu_es.eigenvalues();
|
||||||
|
|
||||||
|
SelfAdjointEigenSolver<Mat> cpu_es(A, EigenvaluesOnly);
|
||||||
|
auto W_cpu = cpu_es.eigenvalues();
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() *
|
||||||
|
W_cpu.cwiseAbs().maxCoeff();
|
||||||
|
VERIFY((W_gpu - W_cpu).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- DeviceMatrix input path ------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_eigen_device_matrix(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat R = Mat::Random(n, n);
|
||||||
|
Mat A = R + R.adjoint();
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
GpuSelfAdjointEigenSolver<Scalar> es;
|
||||||
|
es.compute(d_A);
|
||||||
|
VERIFY_IS_EQUAL(es.info(), Success);
|
||||||
|
|
||||||
|
auto W_gpu = es.eigenvalues();
|
||||||
|
Mat V = es.eigenvectors();
|
||||||
|
|
||||||
|
// Verify reconstruction.
|
||||||
|
Mat A_hat = V * W_gpu.asDiagonal() * V.adjoint();
|
||||||
|
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
|
||||||
|
VERIFY((A_hat - A).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Recompute (reuse solver object) ----------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_eigen_recompute(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver<Scalar> es;
|
||||||
|
|
||||||
|
for (int trial = 0; trial < 3; ++trial) {
|
||||||
|
Mat R = Mat::Random(n, n);
|
||||||
|
Mat A = R + R.adjoint();
|
||||||
|
es.compute(A);
|
||||||
|
VERIFY_IS_EQUAL(es.info(), Success);
|
||||||
|
|
||||||
|
auto W = es.eigenvalues();
|
||||||
|
Mat V = es.eigenvectors();
|
||||||
|
Mat A_hat = V * W.asDiagonal() * V.adjoint();
|
||||||
|
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
|
||||||
|
VERIFY((A_hat - A).norm() < tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Empty matrix -----------------------------------------------------------
|
||||||
|
|
||||||
|
void test_eigen_empty() {
|
||||||
|
GpuSelfAdjointEigenSolver<double> es(MatrixXd(0, 0));
|
||||||
|
VERIFY_IS_EQUAL(es.info(), Success);
|
||||||
|
VERIFY_IS_EQUAL(es.rows(), 0);
|
||||||
|
VERIFY_IS_EQUAL(es.cols(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
// Reconstruction + orthogonality.
|
||||||
|
CALL_SUBTEST(test_eigen_reconstruction<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_eigen_reconstruction<Scalar>(128));
|
||||||
|
|
||||||
|
// Eigenvalues match CPU.
|
||||||
|
CALL_SUBTEST(test_eigen_values<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_eigen_values<Scalar>(128));
|
||||||
|
|
||||||
|
// Values-only mode.
|
||||||
|
CALL_SUBTEST(test_eigen_values_only<Scalar>(64));
|
||||||
|
|
||||||
|
// DeviceMatrix input.
|
||||||
|
CALL_SUBTEST(test_eigen_device_matrix<Scalar>(64));
|
||||||
|
|
||||||
|
// Recompute.
|
||||||
|
CALL_SUBTEST(test_eigen_recompute<Scalar>(32));
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_DECLARE_TEST(gpu_cusolver_eigen) {
|
||||||
|
CALL_SUBTEST(test_scalar<float>());
|
||||||
|
CALL_SUBTEST(test_scalar<double>());
|
||||||
|
CALL_SUBTEST(test_scalar<std::complex<float>>());
|
||||||
|
CALL_SUBTEST(test_scalar<std::complex<double>>());
|
||||||
|
CALL_SUBTEST(test_eigen_empty());
|
||||||
|
}
|
||||||
185
test/gpu_cusolver_qr.cpp
Normal file
185
test/gpu_cusolver_qr.cpp
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuQR: GPU QR decomposition via cuSOLVER.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/QR>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Solve square system: A * X = B -----------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_qr_solve_square(Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
GpuQR<Scalar> qr(A);
|
||||||
|
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||||
|
|
||||||
|
Mat X = qr.solve(B);
|
||||||
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||||
|
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve overdetermined system: m > n (least-squares) ---------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_qr_solve_overdetermined(Index m, Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
eigen_assert(m >= n);
|
||||||
|
Mat A = Mat::Random(m, n);
|
||||||
|
Mat B = Mat::Random(m, nrhs);
|
||||||
|
|
||||||
|
GpuQR<Scalar> qr(A);
|
||||||
|
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||||
|
|
||||||
|
Mat X = qr.solve(B);
|
||||||
|
VERIFY_IS_EQUAL(X.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(X.cols(), nrhs);
|
||||||
|
|
||||||
|
// Compare with CPU QR.
|
||||||
|
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(m) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve with DeviceMatrix input ------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_qr_solve_device(Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
GpuQR<Scalar> qr;
|
||||||
|
qr.compute(d_A);
|
||||||
|
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_X = qr.solve(d_B);
|
||||||
|
Mat X = d_X.toHost();
|
||||||
|
|
||||||
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||||
|
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve overdetermined via device path -----------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_qr_solve_overdetermined_device(Index m, Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
eigen_assert(m >= n);
|
||||||
|
Mat A = Mat::Random(m, n);
|
||||||
|
Mat B = Mat::Random(m, nrhs);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
GpuQR<Scalar> qr;
|
||||||
|
qr.compute(d_A);
|
||||||
|
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_X = qr.solve(d_B);
|
||||||
|
VERIFY_IS_EQUAL(d_X.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(d_X.cols(), nrhs);
|
||||||
|
|
||||||
|
Mat X = d_X.toHost();
|
||||||
|
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(m) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Multiple solves reuse the factorization --------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_qr_multiple_solves(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
GpuQR<Scalar> qr(A);
|
||||||
|
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
for (int k = 0; k < 5; ++k) {
|
||||||
|
Mat B = Mat::Random(n, 3);
|
||||||
|
Mat X = qr.solve(B);
|
||||||
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||||
|
VERIFY(residual < tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Agreement with CPU HouseholderQR ---------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_qr_vs_cpu(Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
GpuQR<Scalar> gpu_qr(A);
|
||||||
|
VERIFY_IS_EQUAL(gpu_qr.info(), Success);
|
||||||
|
|
||||||
|
Mat X_gpu = gpu_qr.solve(B);
|
||||||
|
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST(test_qr_solve_square<Scalar>(1, 1));
|
||||||
|
CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 1));
|
||||||
|
CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_qr_solve_square<Scalar>(256, 8));
|
||||||
|
|
||||||
|
CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(128, 64, 4));
|
||||||
|
CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(256, 128, 1));
|
||||||
|
|
||||||
|
CALL_SUBTEST(test_qr_solve_device<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_qr_solve_overdetermined_device<Scalar>(128, 64, 4));
|
||||||
|
CALL_SUBTEST(test_qr_multiple_solves<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_qr_vs_cpu<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_qr_vs_cpu<Scalar>(256, 8));
|
||||||
|
}
|
||||||
|
|
||||||
|
void test_qr_empty() {
|
||||||
|
GpuQR<double> qr(MatrixXd(0, 0));
|
||||||
|
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||||
|
VERIFY_IS_EQUAL(qr.rows(), 0);
|
||||||
|
VERIFY_IS_EQUAL(qr.cols(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_DECLARE_TEST(gpu_cusolver_qr) {
|
||||||
|
CALL_SUBTEST(test_scalar<float>());
|
||||||
|
CALL_SUBTEST(test_scalar<double>());
|
||||||
|
CALL_SUBTEST(test_scalar<std::complex<float>>());
|
||||||
|
CALL_SUBTEST(test_scalar<std::complex<double>>());
|
||||||
|
CALL_SUBTEST(test_qr_empty());
|
||||||
|
}
|
||||||
194
test/gpu_cusolver_svd.cpp
Normal file
194
test/gpu_cusolver_svd.cpp
Normal file
@@ -0,0 +1,194 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuSVD: GPU SVD via cuSOLVER.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/SVD>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- SVD reconstruction: U * diag(S) * VT ≈ A ------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar, unsigned int Options>
|
||||||
|
void test_svd_reconstruction(Index m, Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, n);
|
||||||
|
GpuSVD<Scalar> svd(A, Options);
|
||||||
|
VERIFY_IS_EQUAL(svd.info(), Success);
|
||||||
|
|
||||||
|
auto S = svd.singularValues();
|
||||||
|
Mat U = svd.matrixU();
|
||||||
|
Mat VT = svd.matrixVT();
|
||||||
|
|
||||||
|
const Index k = (std::min)(m, n);
|
||||||
|
|
||||||
|
// Reconstruct: A_hat = U[:,:k] * diag(S) * VT[:k,:].
|
||||||
|
Mat A_hat = U.leftCols(k) * S.asDiagonal() * VT.topRows(k);
|
||||||
|
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(k)) * NumTraits<Scalar>::epsilon() * A.norm();
|
||||||
|
VERIFY((A_hat - A).norm() < tol);
|
||||||
|
|
||||||
|
// Orthogonality: U^H * U ≈ I.
|
||||||
|
Mat UtU = U.adjoint() * U;
|
||||||
|
Mat I_u = Mat::Identity(U.cols(), U.cols());
|
||||||
|
VERIFY((UtU - I_u).norm() < tol);
|
||||||
|
|
||||||
|
// Orthogonality: VT * VT^H ≈ I.
|
||||||
|
Mat VtVh = VT * VT.adjoint();
|
||||||
|
Mat I_v = Mat::Identity(VT.rows(), VT.rows());
|
||||||
|
VERIFY((VtVh - I_v).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Singular values match CPU BDCSVD ---------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_svd_singular_values(Index m, Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, n);
|
||||||
|
GpuSVD<Scalar> svd(A, 0); // values only
|
||||||
|
VERIFY_IS_EQUAL(svd.info(), Success);
|
||||||
|
|
||||||
|
auto S_gpu = svd.singularValues();
|
||||||
|
auto S_cpu = BDCSVD<Mat>(A, 0).singularValues();
|
||||||
|
|
||||||
|
RealScalar tol =
|
||||||
|
RealScalar(5) * std::sqrt(static_cast<RealScalar>((std::min)(m, n))) * NumTraits<Scalar>::epsilon() * S_cpu(0);
|
||||||
|
VERIFY((S_gpu - S_cpu).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve: pseudoinverse ---------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_svd_solve(Index m, Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, n);
|
||||||
|
Mat B = Mat::Random(m, nrhs);
|
||||||
|
|
||||||
|
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
|
||||||
|
VERIFY_IS_EQUAL(svd.info(), Success);
|
||||||
|
|
||||||
|
Mat X = svd.solve(B);
|
||||||
|
VERIFY_IS_EQUAL(X.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(X.cols(), nrhs);
|
||||||
|
|
||||||
|
// Compare with CPU BDCSVD solve.
|
||||||
|
Mat X_cpu = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV).solve(B);
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar((std::max)(m, n)) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve: truncated -------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_svd_solve_truncated(Index m, Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, n);
|
||||||
|
Mat B = Mat::Random(m, 1);
|
||||||
|
const Index k = (std::min)(m, n);
|
||||||
|
const Index trunc = k / 2;
|
||||||
|
eigen_assert(trunc > 0);
|
||||||
|
|
||||||
|
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
|
||||||
|
Mat X_trunc = svd.solve(B, trunc);
|
||||||
|
|
||||||
|
// Build CPU reference: truncated pseudoinverse.
|
||||||
|
auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
|
||||||
|
auto S = cpu_svd.singularValues();
|
||||||
|
Mat U = cpu_svd.matrixU();
|
||||||
|
Mat V = cpu_svd.matrixV();
|
||||||
|
|
||||||
|
// D_ii = 1/S_i for i < trunc, 0 otherwise.
|
||||||
|
Matrix<RealScalar, Dynamic, 1> D = Matrix<RealScalar, Dynamic, 1>::Zero(k);
|
||||||
|
for (Index i = 0; i < trunc; ++i) D(i) = RealScalar(1) / S(i);
|
||||||
|
Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((X_trunc - X_ref).norm() / X_ref.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve: Tikhonov regularized --------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_svd_solve_regularized(Index m, Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, n);
|
||||||
|
Mat B = Mat::Random(m, 1);
|
||||||
|
RealScalar lambda = RealScalar(0.1);
|
||||||
|
const Index k = (std::min)(m, n);
|
||||||
|
|
||||||
|
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
|
||||||
|
Mat X_reg = svd.solve(B, lambda);
|
||||||
|
|
||||||
|
// CPU reference: D_ii = S_i / (S_i^2 + lambda^2).
|
||||||
|
auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
|
||||||
|
auto S = cpu_svd.singularValues();
|
||||||
|
Mat U = cpu_svd.matrixU();
|
||||||
|
Mat V = cpu_svd.matrixV();
|
||||||
|
|
||||||
|
Matrix<RealScalar, Dynamic, 1> D(k);
|
||||||
|
for (Index i = 0; i < k; ++i) D(i) = S(i) / (S(i) * S(i) + lambda * lambda);
|
||||||
|
Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((X_reg - X_ref).norm() / X_ref.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Empty matrix -----------------------------------------------------------
|
||||||
|
|
||||||
|
void test_svd_empty() {
|
||||||
|
GpuSVD<double> svd(MatrixXd(0, 0), 0);
|
||||||
|
VERIFY_IS_EQUAL(svd.info(), Success);
|
||||||
|
VERIFY_IS_EQUAL(svd.rows(), 0);
|
||||||
|
VERIFY_IS_EQUAL(svd.cols(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
// Reconstruction + orthogonality (thin and full, identical test logic).
|
||||||
|
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 64)));
|
||||||
|
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(128, 64)));
|
||||||
|
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 128))); // wide (m < n)
|
||||||
|
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(64, 64)));
|
||||||
|
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(128, 64)));
|
||||||
|
|
||||||
|
// Singular values.
|
||||||
|
CALL_SUBTEST(test_svd_singular_values<Scalar>(64, 64));
|
||||||
|
CALL_SUBTEST(test_svd_singular_values<Scalar>(128, 64));
|
||||||
|
|
||||||
|
// Solve.
|
||||||
|
CALL_SUBTEST(test_svd_solve<Scalar>(64, 64, 4));
|
||||||
|
CALL_SUBTEST(test_svd_solve<Scalar>(128, 64, 4));
|
||||||
|
CALL_SUBTEST(test_svd_solve<Scalar>(64, 128, 4)); // wide (m < n)
|
||||||
|
|
||||||
|
// Truncated and regularized solve.
|
||||||
|
CALL_SUBTEST(test_svd_solve_truncated<Scalar>(64, 64));
|
||||||
|
CALL_SUBTEST(test_svd_solve_regularized<Scalar>(64, 64));
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_DECLARE_TEST(gpu_cusolver_svd) {
|
||||||
|
CALL_SUBTEST(test_scalar<float>());
|
||||||
|
CALL_SUBTEST(test_scalar<double>());
|
||||||
|
CALL_SUBTEST(test_scalar<std::complex<float>>());
|
||||||
|
CALL_SUBTEST(test_scalar<std::complex<double>>());
|
||||||
|
CALL_SUBTEST(test_svd_empty());
|
||||||
|
}
|
||||||
203
test/gpu_cusparse_spmv.cpp
Normal file
203
test/gpu_cusparse_spmv.cpp
Normal file
@@ -0,0 +1,203 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuSparseContext: GPU SpMV/SpMM via cuSPARSE.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Helper: build a random sparse matrix -----------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
SparseMatrix<Scalar, ColMajor, int> make_sparse(Index rows, Index cols, double density = 0.1) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat R(rows, cols);
|
||||||
|
R.reserve(VectorXi::Constant(cols, static_cast<int>(rows * density) + 1));
|
||||||
|
for (Index j = 0; j < cols; ++j) {
|
||||||
|
for (Index i = 0; i < rows; ++i) {
|
||||||
|
if ((std::rand() / double(RAND_MAX)) < density) {
|
||||||
|
R.insert(i, j) = Scalar(RealScalar(std::rand() / double(RAND_MAX) - 0.5));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
R.makeCompressed();
|
||||||
|
return R;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMV: y = A * x -------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_spmv(Index rows, Index cols) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_sparse<Scalar>(rows, cols);
|
||||||
|
Vec x = Vec::Random(cols);
|
||||||
|
|
||||||
|
GpuSparseContext<Scalar> ctx;
|
||||||
|
Vec y_gpu = ctx.multiply(A, x);
|
||||||
|
Vec y_cpu = A * x;
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY_IS_EQUAL(y_gpu.size(), rows);
|
||||||
|
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMV with alpha/beta: y = alpha*A*x + beta*y ---------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_spmv_alpha_beta(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_sparse<Scalar>(n, n);
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
Vec y_init = Vec::Random(n);
|
||||||
|
|
||||||
|
Scalar alpha(2);
|
||||||
|
Scalar beta(3);
|
||||||
|
|
||||||
|
Vec y_cpu = alpha * (A * x) + beta * y_init;
|
||||||
|
|
||||||
|
GpuSparseContext<Scalar> ctx;
|
||||||
|
Vec y_gpu = y_init;
|
||||||
|
ctx.multiply(A, x, y_gpu, alpha, beta);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Transpose: y = A^T * x ------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_spmv_transpose(Index rows, Index cols) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_sparse<Scalar>(rows, cols);
|
||||||
|
Vec x = Vec::Random(rows);
|
||||||
|
|
||||||
|
GpuSparseContext<Scalar> ctx;
|
||||||
|
Vec y_gpu = ctx.multiplyT(A, x);
|
||||||
|
Vec y_cpu = A.transpose() * x;
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY_IS_EQUAL(y_gpu.size(), cols);
|
||||||
|
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMM: Y = A * X (multiple RHS) ----------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_spmm(Index rows, Index cols, Index nrhs) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_sparse<Scalar>(rows, cols);
|
||||||
|
Mat X = Mat::Random(cols, nrhs);
|
||||||
|
|
||||||
|
GpuSparseContext<Scalar> ctx;
|
||||||
|
Mat Y_gpu = ctx.multiplyMat(A, X);
|
||||||
|
Mat Y_cpu = A * X;
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY_IS_EQUAL(Y_gpu.rows(), rows);
|
||||||
|
VERIFY_IS_EQUAL(Y_gpu.cols(), nrhs);
|
||||||
|
VERIFY((Y_gpu - Y_cpu).norm() / (Y_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Identity matrix: I * x = x --------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_identity(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
// Build sparse identity.
|
||||||
|
SpMat eye(n, n);
|
||||||
|
eye.setIdentity();
|
||||||
|
eye.makeCompressed();
|
||||||
|
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseContext<Scalar> ctx;
|
||||||
|
Vec y = ctx.multiply(eye, x);
|
||||||
|
|
||||||
|
RealScalar tol = NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((y - x).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Context reuse ----------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_reuse(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
GpuSparseContext<Scalar> ctx;
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
|
||||||
|
for (int trial = 0; trial < 3; ++trial) {
|
||||||
|
SpMat A = make_sparse<Scalar>(n, n);
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
Vec y_gpu = ctx.multiply(A, x);
|
||||||
|
Vec y_cpu = A * x;
|
||||||
|
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Empty ------------------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_empty() {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
|
||||||
|
SpMat A(0, 0);
|
||||||
|
A.makeCompressed();
|
||||||
|
Vec x(0);
|
||||||
|
|
||||||
|
GpuSparseContext<Scalar> ctx;
|
||||||
|
Vec y = ctx.multiply(A, x);
|
||||||
|
VERIFY_IS_EQUAL(y.size(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST(test_spmv<Scalar>(64, 64));
|
||||||
|
CALL_SUBTEST(test_spmv<Scalar>(128, 64)); // non-square
|
||||||
|
CALL_SUBTEST(test_spmv<Scalar>(64, 128)); // wide
|
||||||
|
CALL_SUBTEST(test_spmv_alpha_beta<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_spmv_transpose<Scalar>(128, 64));
|
||||||
|
CALL_SUBTEST(test_spmm<Scalar>(64, 64, 4));
|
||||||
|
CALL_SUBTEST(test_identity<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_reuse<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_empty<Scalar>());
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {
|
||||||
|
CALL_SUBTEST(test_scalar<float>());
|
||||||
|
CALL_SUBTEST(test_scalar<double>());
|
||||||
|
CALL_SUBTEST(test_scalar<std::complex<float>>());
|
||||||
|
CALL_SUBTEST(test_scalar<std::complex<double>>());
|
||||||
|
}
|
||||||
@@ -35,7 +35,6 @@ void test_allocate(Index rows, Index cols) {
|
|||||||
VERIFY(!dm.empty());
|
VERIFY(!dm.empty());
|
||||||
VERIFY_IS_EQUAL(dm.rows(), rows);
|
VERIFY_IS_EQUAL(dm.rows(), rows);
|
||||||
VERIFY_IS_EQUAL(dm.cols(), cols);
|
VERIFY_IS_EQUAL(dm.cols(), cols);
|
||||||
VERIFY_IS_EQUAL(dm.outerStride(), rows);
|
|
||||||
VERIFY(dm.data() != nullptr);
|
VERIFY(dm.data() != nullptr);
|
||||||
VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar));
|
VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar));
|
||||||
}
|
}
|
||||||
@@ -69,7 +68,7 @@ void test_roundtrip_async(Index rows, Index cols) {
|
|||||||
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));
|
||||||
|
|
||||||
// Async upload from raw pointer.
|
// Async upload from raw pointer.
|
||||||
auto dm = DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, rows, stream);
|
auto dm = DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, stream);
|
||||||
VERIFY_IS_EQUAL(dm.rows(), rows);
|
VERIFY_IS_EQUAL(dm.rows(), rows);
|
||||||
VERIFY_IS_EQUAL(dm.cols(), cols);
|
VERIFY_IS_EQUAL(dm.cols(), cols);
|
||||||
|
|
||||||
@@ -185,7 +184,6 @@ void test_resize() {
|
|||||||
dm.resize(50, 30);
|
dm.resize(50, 30);
|
||||||
VERIFY_IS_EQUAL(dm.rows(), 50);
|
VERIFY_IS_EQUAL(dm.rows(), 50);
|
||||||
VERIFY_IS_EQUAL(dm.cols(), 30);
|
VERIFY_IS_EQUAL(dm.cols(), 30);
|
||||||
VERIFY_IS_EQUAL(dm.outerStride(), 50);
|
|
||||||
VERIFY(dm.data() != nullptr);
|
VERIFY(dm.data() != nullptr);
|
||||||
|
|
||||||
// Resize to same dimensions is a no-op.
|
// Resize to same dimensions is a no-op.
|
||||||
|
|||||||
Reference in New Issue
Block a user