mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
Compare commits
4 Commits
gpu-modern
...
gpu-cg-int
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
014f12f11a | ||
|
|
43a95b62bb | ||
|
|
8593c7f5a1 | ||
|
|
58c44ef36d |
@@ -1,4 +1,4 @@
|
|||||||
cmake_minimum_required(VERSION 3.10.0)
|
cmake_minimum_required(VERSION 3.17)
|
||||||
|
|
||||||
#==============================================================================
|
#==============================================================================
|
||||||
# CMake Policy issues.
|
# CMake Policy issues.
|
||||||
@@ -9,7 +9,7 @@ if (POLICY CMP0077)
|
|||||||
endif (POLICY CMP0077)
|
endif (POLICY CMP0077)
|
||||||
|
|
||||||
# NOTE Remove setting the policy once the minimum required CMake version is
|
# NOTE Remove setting the policy once the minimum required CMake version is
|
||||||
# increased to at least 3.15. Retain enabling the export to package registry.
|
# increased to at least 3.21. Retain enabling the export to package registry.
|
||||||
if (POLICY CMP0090)
|
if (POLICY CMP0090)
|
||||||
# The export command does not populate package registry by default
|
# The export command does not populate package registry by default
|
||||||
cmake_policy (SET CMP0090 NEW)
|
cmake_policy (SET CMP0090 NEW)
|
||||||
|
|||||||
72
Eigen/GPU
Normal file
72
Eigen/GPU
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_MODULE_H
|
||||||
|
#define EIGEN_GPU_MODULE_H
|
||||||
|
|
||||||
|
#include "Core"
|
||||||
|
|
||||||
|
#include "src/Core/util/DisableStupidWarnings.h"
|
||||||
|
|
||||||
|
/** \defgroup GPU_Module GPU module
|
||||||
|
*
|
||||||
|
* GPU-accelerated solvers and operations using NVIDIA CUDA libraries
|
||||||
|
* (cuSOLVER, cuBLAS, cuSPARSE, cuFFT, cuDSS).
|
||||||
|
*
|
||||||
|
* This module provides explicit GPU solver classes that coexist with Eigen's
|
||||||
|
* CPU solvers. Unlike the LAPACKE dispatch (which replaces the CPU
|
||||||
|
* implementation globally), GPU classes are separate types the user
|
||||||
|
* instantiates by choice:
|
||||||
|
*
|
||||||
|
* \code
|
||||||
|
* #define EIGEN_USE_GPU
|
||||||
|
* #include <Eigen/GPU>
|
||||||
|
*
|
||||||
|
* // CPU path (unchanged)
|
||||||
|
* Eigen::LLT<Eigen::MatrixXd> llt_cpu(A);
|
||||||
|
*
|
||||||
|
* // GPU path (explicit)
|
||||||
|
* Eigen::GpuLLT<double> llt_gpu(A); // L stays on device
|
||||||
|
* auto X = llt_gpu.solve(B); // only B transferred per solve
|
||||||
|
* \endcode
|
||||||
|
*
|
||||||
|
* Requires CUDA 11.4+. See CLAUDE.md.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef EIGEN_USE_GPU
|
||||||
|
// IWYU pragma: begin_exports
|
||||||
|
#include "src/GPU/DeviceScalar.h"
|
||||||
|
#include "src/GPU/DeviceMatrix.h"
|
||||||
|
#include "src/GPU/GpuContext.h"
|
||||||
|
#include "src/GPU/DeviceExpr.h"
|
||||||
|
#include "src/GPU/DeviceBlasExpr.h"
|
||||||
|
#include "src/GPU/DeviceSolverExpr.h"
|
||||||
|
#include "src/GPU/DeviceDispatch.h"
|
||||||
|
#include "src/GPU/GpuLLT.h"
|
||||||
|
#include "src/GPU/GpuLU.h"
|
||||||
|
#include "src/GPU/GpuQR.h"
|
||||||
|
#include "src/GPU/GpuSVD.h"
|
||||||
|
#include "src/GPU/GpuEigenSolver.h"
|
||||||
|
#include "src/GPU/CuFftSupport.h"
|
||||||
|
#include "src/GPU/GpuFFT.h"
|
||||||
|
#include "src/GPU/CuSparseSupport.h"
|
||||||
|
#ifdef EIGEN_SPARSECORE_MODULE_H
|
||||||
|
#include "src/GPU/GpuSparseContext.h"
|
||||||
|
#endif
|
||||||
|
#if defined(EIGEN_CUDSS) && defined(EIGEN_SPARSECORE_MODULE_H)
|
||||||
|
#include "src/GPU/CuDssSupport.h"
|
||||||
|
#include "src/GPU/GpuSparseSolverBase.h"
|
||||||
|
#include "src/GPU/GpuSparseLLT.h"
|
||||||
|
#include "src/GPU/GpuSparseLDLT.h"
|
||||||
|
#include "src/GPU/GpuSparseLU.h"
|
||||||
|
#endif
|
||||||
|
// IWYU pragma: end_exports
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "src/Core/util/ReenableStupidWarnings.h"
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_MODULE_H
|
||||||
427
Eigen/src/GPU/CuBlasSupport.h
Normal file
427
Eigen/src/GPU/CuBlasSupport.h
Normal file
@@ -0,0 +1,427 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
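// cuBLAS-specific support types:
//   - Error-checking macros (cuBLAS and cuBLASLt)
//   - Operation enum and mapping to cublasOperation_t
//   - Scalar -> cublasComputeType_t mapping and the cublasLt GEMM dispatch
//   - Type-overloaded wrappers for selected cuBLAS routines
//     (trsm, symm/hemm, syrk/herk, geam, dot, nrm2, axpy, scal, copy)
//
// Generic CUDA runtime utilities (DeviceBuffer, cuda_data_type) are in GpuSupport.h.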
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_CUBLAS_SUPPORT_H
|
||||||
|
#define EIGEN_GPU_CUBLAS_SUPPORT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSupport.h"
|
||||||
|
#include <cublas_v2.h>
|
||||||
|
#include <cublasLt.h>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
// ---- Error-checking macro ---------------------------------------------------
|
||||||
|
|
||||||
|
// Evaluates `expr` (a call returning cublasStatus_t) exactly once and asserts
// that it yielded CUBLAS_STATUS_SUCCESS. The status is only inspected through
// eigen_assert, so when assertions are compiled out the call is still made but
// its result is not checked. EIGEN_UNUSED_VARIABLE keeps `_s` from triggering
// an unused-variable warning in that configuration.
#define EIGEN_CUBLAS_CHECK(expr)                                       \
  do {                                                                 \
    cublasStatus_t _s = (expr);                                        \
    eigen_assert(_s == CUBLAS_STATUS_SUCCESS && "cuBLAS call failed"); \
    EIGEN_UNUSED_VARIABLE(_s);                                         \
  } while (0)
|
||||||
|
|
||||||
|
// ---- Operation enum ---------------------------------------------------------
|
||||||
|
// Maps transpose/adjoint flags to cublasOperation_t.
|
||||||
|
|
||||||
|
enum class GpuOp { NoTrans, Trans, ConjTrans };
|
||||||
|
|
||||||
|
constexpr cublasOperation_t to_cublas_op(GpuOp op) {
|
||||||
|
switch (op) {
|
||||||
|
case GpuOp::Trans:
|
||||||
|
return CUBLAS_OP_T;
|
||||||
|
case GpuOp::ConjTrans:
|
||||||
|
return CUBLAS_OP_C;
|
||||||
|
default:
|
||||||
|
return CUBLAS_OP_N;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Scalar → cublasComputeType_t -------------------------------------------
|
||||||
|
// cublasLtMatmul requires a compute type (separate from the data type).
|
||||||
|
//
|
||||||
|
// Precision policy:
|
||||||
|
// - Default: tensor core algorithms enabled via cublasLtMatmul heuristics.
|
||||||
|
// For double, cuBLAS may use Ozaki emulation on sm_80+ tensor cores.
|
||||||
|
// - EIGEN_CUDA_TF32: opt-in to TF32 for float (~2x faster, 10-bit mantissa).
|
||||||
|
// - EIGEN_NO_CUDA_TENSOR_OPS: disables all tensor core usage. Uses pedantic
|
||||||
|
// compute types. For bit-exact reproducibility.
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct cuda_compute_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cuda_compute_type<float> {
|
||||||
|
#if defined(EIGEN_NO_CUDA_TENSOR_OPS)
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_PEDANTIC;
|
||||||
|
#elif defined(EIGEN_CUDA_TF32)
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_FAST_TF32;
|
||||||
|
#else
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cuda_compute_type<double> {
|
||||||
|
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F_PEDANTIC;
|
||||||
|
#else
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cuda_compute_type<std::complex<float>> {
|
||||||
|
#if defined(EIGEN_NO_CUDA_TENSOR_OPS)
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_PEDANTIC;
|
||||||
|
#elif defined(EIGEN_CUDA_TF32)
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F_FAST_TF32;
|
||||||
|
#else
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_32F;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cuda_compute_type<std::complex<double>> {
|
||||||
|
#ifdef EIGEN_NO_CUDA_TENSOR_OPS
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F_PEDANTIC;
|
||||||
|
#else
|
||||||
|
static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
// ---- Alpha/beta scalar type for cublasLtMatmul ------------------------------
|
||||||
|
// For standard types, alpha/beta match the scalar type.
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct cuda_gemm_scalar {
|
||||||
|
using type = Scalar;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- cublasLt GEMM dispatch -------------------------------------------------
|
||||||
|
// Wraps cublasLtMatmul with descriptor setup, heuristic algorithm selection,
|
||||||
|
// and lazy workspace management. Supports 64-bit dimensions natively.
|
||||||
|
//
|
||||||
|
// The workspace buffer (DeviceBuffer*) is grown lazily to match the selected
|
||||||
|
// algorithm's actual requirement. The heuristic is queried with a generous
|
||||||
|
// 32 MB cap so that the best algorithm is never excluded. Growth is monotonic:
|
||||||
|
// the buffer only grows, never shrinks, so reallocation happens at most a few
|
||||||
|
// times during the lifetime of the owning GpuContext or solver.
|
||||||
|
//
|
||||||
|
// EIGEN_NO_CUDA_TENSOR_OPS: pedantic compute types (CUBLAS_COMPUTE_32F_PEDANTIC,
|
||||||
|
// CUBLAS_COMPUTE_64F_PEDANTIC) prevent cublasLt from selecting tensor core
|
||||||
|
// algorithms, matching the previous cublasGemmEx behavior.
|
||||||
|
//
|
||||||
|
// Thread safety: the workspace buffer is not thread-safe. All GEMM calls
|
||||||
|
// sharing a workspace must be on the same CUDA stream (guaranteed by GpuContext's
|
||||||
|
// single-stream design and by each GpuSVD owning its own stream).
|
||||||
|
//
|
||||||
|
// Future optimization: for hot loops (e.g., CG iteration), caching descriptors
|
||||||
|
// and the selected algorithm by (m, n, k, dtype, transA, transB) would avoid
|
||||||
|
// per-call descriptor creation and heuristic lookup overhead.
|
||||||
|
|
||||||
|
// Evaluates `expr` (a cuBLASLt call returning cublasStatus_t) exactly once and
// asserts that it yielded CUBLAS_STATUS_SUCCESS. The status is only inspected
// through eigen_assert; EIGEN_UNUSED_VARIABLE keeps `_s` from triggering an
// unused-variable warning when assertions are compiled out.
#define EIGEN_CUBLASLT_CHECK(expr)                                       \
  do {                                                                   \
    cublasStatus_t _s = (expr);                                          \
    eigen_assert(_s == CUBLAS_STATUS_SUCCESS && "cuBLASLt call failed"); \
    EIGEN_UNUSED_VARIABLE(_s);                                           \
  } while (0)
|
||||||
|
|
||||||
|
// Maximum workspace the heuristic is allowed to consider. This is a preference
|
||||||
|
// ceiling, not an allocation — actual allocation matches the selected algorithm.
|
||||||
|
static constexpr size_t kCublasLtMaxWorkspaceBytes = 32 * 1024 * 1024; // 32 MB
|
||||||
|
|
||||||
|
// cublasGemmEx fallback algorithm hint (used when cublasLt heuristic returns no results).
|
||||||
|
// Algorithm hint handed to cublasGemmEx: the tensor-op default, unless
// EIGEN_NO_CUDA_TENSOR_OPS is defined, in which case the plain default is used.
constexpr cublasGemmAlgo_t cuda_gemm_algo() {
#ifndef EIGEN_NO_CUDA_TENSOR_OPS
  return CUBLAS_GEMM_DEFAULT_TENSOR_OP;
#else
  return CUBLAS_GEMM_DEFAULT;
#endif
}
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void cublaslt_gemm(cublasLtHandle_t lt_handle, cublasHandle_t cublas_handle, cublasOperation_t transA,
|
||||||
|
cublasOperation_t transB, int64_t m, int64_t n, int64_t k,
|
||||||
|
const typename cuda_gemm_scalar<Scalar>::type* alpha, const Scalar* A, int64_t lda, const Scalar* B,
|
||||||
|
int64_t ldb, const typename cuda_gemm_scalar<Scalar>::type* beta, Scalar* C, int64_t ldc,
|
||||||
|
DeviceBuffer* workspace, cudaStream_t stream) {
|
||||||
|
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
||||||
|
constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
|
||||||
|
using AlphaType = typename cuda_gemm_scalar<Scalar>::type;
|
||||||
|
constexpr cudaDataType_t alpha_type = cuda_data_type<AlphaType>::value;
|
||||||
|
|
||||||
|
// Matmul descriptor.
|
||||||
|
cublasLtMatmulDesc_t matmul_desc = nullptr;
|
||||||
|
EIGEN_CUBLASLT_CHECK(cublasLtMatmulDescCreate(&matmul_desc, compute, alpha_type));
|
||||||
|
EIGEN_CUBLASLT_CHECK(
|
||||||
|
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transA, sizeof(transA)));
|
||||||
|
EIGEN_CUBLASLT_CHECK(
|
||||||
|
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, sizeof(transB)));
|
||||||
|
|
||||||
|
// Matrix layout descriptors (column-major).
|
||||||
|
// Physical layout dimensions: rows × cols with leading dimension lda/ldb/ldc.
|
||||||
|
const int64_t a_rows = (transA == CUBLAS_OP_N) ? m : k;
|
||||||
|
const int64_t b_rows = (transB == CUBLAS_OP_N) ? k : n;
|
||||||
|
|
||||||
|
cublasLtMatrixLayout_t layout_A = nullptr, layout_B = nullptr, layout_C = nullptr;
|
||||||
|
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_A, dtype, a_rows, (transA == CUBLAS_OP_N) ? k : m, lda));
|
||||||
|
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_B, dtype, b_rows, (transB == CUBLAS_OP_N) ? n : k, ldb));
|
||||||
|
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_C, dtype, m, n, ldc));
|
||||||
|
|
||||||
|
// Heuristic selection: query with generous workspace cap, allocate only what's needed.
|
||||||
|
cublasLtMatmulPreference_t preference = nullptr;
|
||||||
|
EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceCreate(&preference));
|
||||||
|
size_t max_ws = kCublasLtMaxWorkspaceBytes;
|
||||||
|
EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
|
||||||
|
&max_ws, sizeof(max_ws)));
|
||||||
|
|
||||||
|
cublasLtMatmulHeuristicResult_t result;
|
||||||
|
int returned_results = 0;
|
||||||
|
cublasStatus_t heuristic_status = cublasLtMatmulAlgoGetHeuristic(lt_handle, matmul_desc, layout_A, layout_B, layout_C,
|
||||||
|
layout_C, preference, 1, &result, &returned_results);
|
||||||
|
|
||||||
|
if (heuristic_status == CUBLAS_STATUS_SUCCESS && returned_results > 0) {
|
||||||
|
// cublasLt path: use the selected algorithm with lazy workspace.
|
||||||
|
const size_t needed = result.workspaceSize;
|
||||||
|
if (needed > workspace->size()) {
|
||||||
|
// Sync only when freeing an existing buffer that may be in use.
|
||||||
|
if (workspace->ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
|
||||||
|
*workspace = DeviceBuffer(needed);
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_CUBLASLT_CHECK(cublasLtMatmul(lt_handle, matmul_desc, alpha, A, layout_A, B, layout_B, beta, C, layout_C, C,
|
||||||
|
layout_C, &result.algo, workspace->ptr, needed, stream));
|
||||||
|
} else {
|
||||||
|
// Fallback: cublasGemmEx for shapes/types that cublasLt cannot handle.
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_handle, transA, transB, static_cast<int>(m), static_cast<int>(n),
|
||||||
|
static_cast<int>(k), alpha, A, dtype, static_cast<int>(lda), B, dtype,
|
||||||
|
static_cast<int>(ldb), beta, C, dtype, static_cast<int>(ldc), compute,
|
||||||
|
cuda_gemm_algo()));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cleanup descriptors.
|
||||||
|
EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceDestroy(preference));
|
||||||
|
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_C));
|
||||||
|
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_B));
|
||||||
|
EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_A));
|
||||||
|
EIGEN_CUBLASLT_CHECK(cublasLtMatmulDescDestroy(matmul_desc));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Type-specific cuBLAS wrappers ------------------------------------------
|
||||||
|
// cuBLAS uses separate functions per type (Strsm, Dtrsm, etc.).
|
||||||
|
// These overloaded wrappers allow calling cublasXtrsm/cublasXsymm/cublasXsyrk
|
||||||
|
// with any supported scalar type.
|
||||||
|
|
||||||
|
// TRSM wrappers
|
||||||
|
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
|
||||||
|
cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha,
|
||||||
|
const float* A, int lda, float* B, int ldb) {
|
||||||
|
return cublasStrsm(h, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
|
||||||
|
cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha,
|
||||||
|
const double* A, int lda, double* B, int ldb) {
|
||||||
|
return cublasDtrsm(h, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
|
||||||
|
cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
|
||||||
|
const std::complex<float>* alpha, const std::complex<float>* A, int lda,
|
||||||
|
std::complex<float>* B, int ldb) {
|
||||||
|
return cublasCtrsm(h, side, uplo, trans, diag, m, n, reinterpret_cast<const cuComplex*>(alpha),
|
||||||
|
reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<cuComplex*>(B), ldb);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
|
||||||
|
cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
|
||||||
|
const std::complex<double>* alpha, const std::complex<double>* A, int lda,
|
||||||
|
std::complex<double>* B, int ldb) {
|
||||||
|
return cublasZtrsm(h, side, uplo, trans, diag, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
|
||||||
|
reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<cuDoubleComplex*>(B), ldb);
|
||||||
|
}
|
||||||
|
|
||||||
|
// SYMM wrappers (real → symm, complex → hemm)
|
||||||
|
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
|
||||||
|
const float* alpha, const float* A, int lda, const float* B, int ldb,
|
||||||
|
const float* beta, float* C, int ldc) {
|
||||||
|
return cublasSsymm(h, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
|
||||||
|
const double* alpha, const double* A, int lda, const double* B, int ldb,
|
||||||
|
const double* beta, double* C, int ldc) {
|
||||||
|
return cublasDsymm(h, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
|
||||||
|
const std::complex<float>* alpha, const std::complex<float>* A, int lda,
|
||||||
|
const std::complex<float>* B, int ldb, const std::complex<float>* beta,
|
||||||
|
std::complex<float>* C, int ldc) {
|
||||||
|
return cublasChemm(h, side, uplo, m, n, reinterpret_cast<const cuComplex*>(alpha),
|
||||||
|
reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<const cuComplex*>(B), ldb,
|
||||||
|
reinterpret_cast<const cuComplex*>(beta), reinterpret_cast<cuComplex*>(C), ldc);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
|
||||||
|
const std::complex<double>* alpha, const std::complex<double>* A, int lda,
|
||||||
|
const std::complex<double>* B, int ldb, const std::complex<double>* beta,
|
||||||
|
std::complex<double>* C, int ldc) {
|
||||||
|
return cublasZhemm(h, side, uplo, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
|
||||||
|
reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<const cuDoubleComplex*>(B), ldb,
|
||||||
|
reinterpret_cast<const cuDoubleComplex*>(beta), reinterpret_cast<cuDoubleComplex*>(C), ldc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// SYRK wrappers (real → syrk, complex → herk)
|
||||||
|
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
|
||||||
|
const float* alpha, const float* A, int lda, const float* beta, float* C, int ldc) {
|
||||||
|
return cublasSsyrk(h, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
|
||||||
|
const double* alpha, const double* A, int lda, const double* beta, double* C,
|
||||||
|
int ldc) {
|
||||||
|
return cublasDsyrk(h, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
|
||||||
|
const float* alpha, const std::complex<float>* A, int lda, const float* beta,
|
||||||
|
std::complex<float>* C, int ldc) {
|
||||||
|
return cublasCherk(h, uplo, trans, n, k, alpha, reinterpret_cast<const cuComplex*>(A), lda, beta,
|
||||||
|
reinterpret_cast<cuComplex*>(C), ldc);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
|
||||||
|
const double* alpha, const std::complex<double>* A, int lda, const double* beta,
|
||||||
|
std::complex<double>* C, int ldc) {
|
||||||
|
return cublasZherk(h, uplo, trans, n, k, alpha, reinterpret_cast<const cuDoubleComplex*>(A), lda, beta,
|
||||||
|
reinterpret_cast<cuDoubleComplex*>(C), ldc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// GEAM wrappers: C = alpha * op(A) + beta * op(B)
|
||||||
|
// Covers transpose, scale, matrix add/subtract in one call.
|
||||||
|
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
|
||||||
|
const float* alpha, const float* A, int lda, const float* beta, const float* B,
|
||||||
|
int ldb, float* C, int ldc) {
|
||||||
|
return cublasSgeam(h, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
|
||||||
|
const double* alpha, const double* A, int lda, const double* beta, const double* B,
|
||||||
|
int ldb, double* C, int ldc) {
|
||||||
|
return cublasDgeam(h, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
|
||||||
|
const std::complex<float>* alpha, const std::complex<float>* A, int lda,
|
||||||
|
const std::complex<float>* beta, const std::complex<float>* B, int ldb,
|
||||||
|
std::complex<float>* C, int ldc) {
|
||||||
|
return cublasCgeam(h, transa, transb, m, n, reinterpret_cast<const cuComplex*>(alpha),
|
||||||
|
reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<const cuComplex*>(beta),
|
||||||
|
reinterpret_cast<const cuComplex*>(B), ldb, reinterpret_cast<cuComplex*>(C), ldc);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cublasOperation_t transb, int m, int n,
|
||||||
|
const std::complex<double>* alpha, const std::complex<double>* A, int lda,
|
||||||
|
const std::complex<double>* beta, const std::complex<double>* B, int ldb,
|
||||||
|
std::complex<double>* C, int ldc) {
|
||||||
|
return cublasZgeam(h, transa, transb, m, n, reinterpret_cast<const cuDoubleComplex*>(alpha),
|
||||||
|
reinterpret_cast<const cuDoubleComplex*>(A), lda, reinterpret_cast<const cuDoubleComplex*>(beta),
|
||||||
|
reinterpret_cast<const cuDoubleComplex*>(B), ldb, reinterpret_cast<cuDoubleComplex*>(C), ldc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- cuBLAS Level-1 wrappers ------------------------------------------------
|
||||||
|
// Type-dispatched wrappers for BLAS-1 vector operations: dot, axpy, nrm2, scal, copy.
|
||||||
|
// These work with CUBLAS_POINTER_MODE_HOST or CUBLAS_POINTER_MODE_DEVICE depending
|
||||||
|
// on the caller's configuration. For device pointer mode, scalar result pointers
|
||||||
|
// (dot, nrm2) must point to device memory.
|
||||||
|
|
||||||
|
// dot: result = x^T * y (real) or x^H * y (complex conjugate dot)
|
||||||
|
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const float* x, int incx, const float* y, int incy,
|
||||||
|
float* result) {
|
||||||
|
return cublasSdot(h, n, x, incx, y, incy, result);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const double* x, int incx, const double* y, int incy,
|
||||||
|
double* result) {
|
||||||
|
return cublasDdot(h, n, x, incx, y, incy, result);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const std::complex<float>* x, int incx,
|
||||||
|
const std::complex<float>* y, int incy, std::complex<float>* result) {
|
||||||
|
return cublasCdotc(h, n, reinterpret_cast<const cuComplex*>(x), incx, reinterpret_cast<const cuComplex*>(y), incy,
|
||||||
|
reinterpret_cast<cuComplex*>(result));
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const std::complex<double>* x, int incx,
|
||||||
|
const std::complex<double>* y, int incy, std::complex<double>* result) {
|
||||||
|
return cublasZdotc(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx,
|
||||||
|
reinterpret_cast<const cuDoubleComplex*>(y), incy, reinterpret_cast<cuDoubleComplex*>(result));
|
||||||
|
}
|
||||||
|
|
||||||
|
// nrm2: result = ||x||_2 (always returns real)
|
||||||
|
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const float* x, int incx, float* result) {
|
||||||
|
return cublasSnrm2(h, n, x, incx, result);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const double* x, int incx, double* result) {
|
||||||
|
return cublasDnrm2(h, n, x, incx, result);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const std::complex<float>* x, int incx, float* result) {
|
||||||
|
return cublasScnrm2(h, n, reinterpret_cast<const cuComplex*>(x), incx, result);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const std::complex<double>* x, int incx, double* result) {
|
||||||
|
return cublasDznrm2(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
// axpy: y += alpha * x
|
||||||
|
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const float* alpha, const float* x, int incx, float* y,
|
||||||
|
int incy) {
|
||||||
|
return cublasSaxpy(h, n, alpha, x, incx, y, incy);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const double* alpha, const double* x, int incx, double* y,
|
||||||
|
int incy) {
|
||||||
|
return cublasDaxpy(h, n, alpha, x, incx, y, incy);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const std::complex<float>* alpha,
|
||||||
|
const std::complex<float>* x, int incx, std::complex<float>* y, int incy) {
|
||||||
|
return cublasCaxpy(h, n, reinterpret_cast<const cuComplex*>(alpha), reinterpret_cast<const cuComplex*>(x), incx,
|
||||||
|
reinterpret_cast<cuComplex*>(y), incy);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const std::complex<double>* alpha,
|
||||||
|
const std::complex<double>* x, int incx, std::complex<double>* y, int incy) {
|
||||||
|
return cublasZaxpy(h, n, reinterpret_cast<const cuDoubleComplex*>(alpha), reinterpret_cast<const cuDoubleComplex*>(x),
|
||||||
|
incx, reinterpret_cast<cuDoubleComplex*>(y), incy);
|
||||||
|
}
|
||||||
|
|
||||||
|
// scal: x *= alpha
|
||||||
|
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const float* alpha, float* x, int incx) {
|
||||||
|
return cublasSscal(h, n, alpha, x, incx);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const double* alpha, double* x, int incx) {
|
||||||
|
return cublasDscal(h, n, alpha, x, incx);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const std::complex<float>* alpha, std::complex<float>* x,
|
||||||
|
int incx) {
|
||||||
|
return cublasCscal(h, n, reinterpret_cast<const cuComplex*>(alpha), reinterpret_cast<cuComplex*>(x), incx);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const std::complex<double>* alpha, std::complex<double>* x,
|
||||||
|
int incx) {
|
||||||
|
return cublasZscal(h, n, reinterpret_cast<const cuDoubleComplex*>(alpha), reinterpret_cast<cuDoubleComplex*>(x),
|
||||||
|
incx);
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy: y = x
|
||||||
|
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const float* x, int incx, float* y, int incy) {
|
||||||
|
return cublasScopy(h, n, x, incx, y, incy);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const double* x, int incx, double* y, int incy) {
|
||||||
|
return cublasDcopy(h, n, x, incx, y, incy);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const std::complex<float>* x, int incx,
|
||||||
|
std::complex<float>* y, int incy) {
|
||||||
|
return cublasCcopy(h, n, reinterpret_cast<const cuComplex*>(x), incx, reinterpret_cast<cuComplex*>(y), incy);
|
||||||
|
}
|
||||||
|
inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const std::complex<double>* x, int incx,
|
||||||
|
std::complex<double>* y, int incy) {
|
||||||
|
return cublasZcopy(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx, reinterpret_cast<cuDoubleComplex*>(y),
|
||||||
|
incy);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_CUBLAS_SUPPORT_H
|
||||||
134
Eigen/src/GPU/CuDssSupport.h
Normal file
134
Eigen/src/GPU/CuDssSupport.h
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// cuDSS support utilities: error checking macro, type mapping.
|
||||||
|
//
|
||||||
|
// cuDSS is NVIDIA's sparse direct solver library, supporting Cholesky (LL^T),
|
||||||
|
// LDL^T, and LU factorization on GPU. It requires CUDA 12.0+ and is
|
||||||
|
// distributed separately from the CUDA Toolkit.
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_CUDSS_SUPPORT_H
|
||||||
|
#define EIGEN_GPU_CUDSS_SUPPORT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSupport.h"
|
||||||
|
#include <cudss.h>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
// ---- Error checking ---------------------------------------------------------
|
||||||
|
|
||||||
|
// Evaluates the cuDSS call `x` exactly once and asserts that it returned
// CUDSS_STATUS_SUCCESS; the assertion message embeds the text of the call.
// The status is marked as used so that builds without assertions stay
// warning-free.
#define EIGEN_CUDSS_CHECK(x)                                                         \
  do {                                                                               \
    cudssStatus_t _cudss_status = (x);                                               \
    eigen_assert(_cudss_status == CUDSS_STATUS_SUCCESS && "cuDSS call failed: " #x); \
    EIGEN_UNUSED_VARIABLE(_cudss_status);                                            \
  } while (0)
|
||||||
|
|
||||||
|
// ---- Scalar → cudssMatrixType_t for SPD/HPD ---------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct cudss_spd_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cudss_spd_type<float> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_spd_type<double> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_spd_type<std::complex<float>> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_spd_type<std::complex<double>> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- Scalar → cudssMatrixType_t for symmetric/Hermitian ---------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct cudss_symmetric_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cudss_symmetric_type<float> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_symmetric_type<double> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_symmetric_type<std::complex<float>> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_symmetric_type<std::complex<double>> {
|
||||||
|
static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- StorageIndex → cudaDataType_t ------------------------------------------
|
||||||
|
|
||||||
|
template <typename StorageIndex>
|
||||||
|
struct cudss_index_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cudss_index_type<int> {
|
||||||
|
static constexpr cudaDataType_t value = CUDA_R_32I;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_index_type<int64_t> {
|
||||||
|
static constexpr cudaDataType_t value = CUDA_R_64I;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- UpLo → cudssMatrixViewType_t -------------------------------------------
|
||||||
|
// For symmetric matrices stored as CSC (ColMajor), cuDSS sees CSR of A^T.
|
||||||
|
// Since A = A^T, the data is the same, but the triangle view must be swapped.
|
||||||
|
|
||||||
|
template <int UpLo, int StorageOrder>
|
||||||
|
struct cudss_view_type;
|
||||||
|
|
||||||
|
// ColMajor (CSC) passed as CSR: lower ↔ upper swap.
|
||||||
|
template <>
|
||||||
|
struct cudss_view_type<Lower, ColMajor> {
|
||||||
|
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_view_type<Upper, ColMajor> {
|
||||||
|
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
|
||||||
|
};
|
||||||
|
|
||||||
|
// RowMajor (CSR) passed directly: no swap needed.
|
||||||
|
template <>
|
||||||
|
struct cudss_view_type<Lower, RowMajor> {
|
||||||
|
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cudss_view_type<Upper, RowMajor> {
|
||||||
|
static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
|
||||||
|
// ---- Ordering enum ----------------------------------------------------------
|
||||||
|
|
||||||
|
// Ordering choices. Enumerator values are spelled out explicitly and match the
// implicit numbering (0, 1, 2).
enum class GpuSparseOrdering {
  AMD = 0,    // Default fill-reducing ordering
  METIS = 1,  // METIS nested dissection
  RCM = 2     // Reverse Cuthill-McKee
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_CUDSS_SUPPORT_H
|
||||||
103
Eigen/src/GPU/CuFftSupport.h
Normal file
103
Eigen/src/GPU/CuFftSupport.h
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// cuFFT support utilities: error checking macro, type mapping.
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_CUFFT_SUPPORT_H
|
||||||
|
#define EIGEN_GPU_CUFFT_SUPPORT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSupport.h"
|
||||||
|
#include <cufft.h>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
// ---- Error checking ---------------------------------------------------------
|
||||||
|
|
||||||
|
// Evaluates the cuFFT call `x` exactly once and asserts (via eigen_assert) that
// it returned CUFFT_SUCCESS. The stringified call is appended to the assertion
// message. The result is marked as used so that builds in which eigen_assert
// expands to nothing do not warn about an unused variable.
#define EIGEN_CUFFT_CHECK(x)                                       \
  do {                                                             \
    const cufftResult _r = (x);                                    \
    eigen_assert(_r == CUFFT_SUCCESS && "cuFFT call failed: " #x); \
    EIGEN_UNUSED_VARIABLE(_r);                                     \
  } while (0)
|
||||||
|
|
||||||
|
// ---- Scalar → cufftType traits ----------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct cufft_c2c_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cufft_c2c_type<float> {
|
||||||
|
static constexpr cufftType value = CUFFT_C2C;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cufft_c2c_type<double> {
|
||||||
|
static constexpr cufftType value = CUFFT_Z2Z;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct cufft_r2c_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cufft_r2c_type<float> {
|
||||||
|
static constexpr cufftType value = CUFFT_R2C;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cufft_r2c_type<double> {
|
||||||
|
static constexpr cufftType value = CUFFT_D2Z;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct cufft_c2r_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cufft_c2r_type<float> {
|
||||||
|
static constexpr cufftType value = CUFFT_C2R;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cufft_c2r_type<double> {
|
||||||
|
static constexpr cufftType value = CUFFT_Z2D;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- Type-dispatched cuFFT execution ----------------------------------------
|
||||||
|
|
||||||
|
// C2C
|
||||||
|
// Single-precision complex-to-complex execution: reinterprets the
// std::complex<float> pointers as cufftComplex and forwards to cufftExecC2C.
// Returns the cufftResult of the underlying call unchanged.
inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<float>* in, std::complex<float>* out,
                                         int direction) {
  cufftComplex* const src = reinterpret_cast<cufftComplex*>(in);
  cufftComplex* const dst = reinterpret_cast<cufftComplex*>(out);
  return cufftExecC2C(plan, src, dst, direction);
}

// Double-precision complex-to-complex execution: reinterprets the
// std::complex<double> pointers as cufftDoubleComplex and forwards to
// cufftExecZ2Z. Returns the cufftResult of the underlying call unchanged.
inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<double>* in, std::complex<double>* out,
                                         int direction) {
  cufftDoubleComplex* const src = reinterpret_cast<cufftDoubleComplex*>(in);
  cufftDoubleComplex* const dst = reinterpret_cast<cufftDoubleComplex*>(out);
  return cufftExecZ2Z(plan, src, dst, direction);
}
|
||||||
|
|
||||||
|
// R2C
|
||||||
|
// Single-precision real-to-complex execution: forwards to cufftExecR2C with the
// output pointer reinterpreted as cufftComplex.
inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, float* in, std::complex<float>* out) {
  cufftComplex* const dst = reinterpret_cast<cufftComplex*>(out);
  return cufftExecR2C(plan, in, dst);
}

// Double-precision real-to-complex execution: forwards to cufftExecD2Z with the
// output pointer reinterpreted as cufftDoubleComplex.
inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, double* in, std::complex<double>* out) {
  cufftDoubleComplex* const dst = reinterpret_cast<cufftDoubleComplex*>(out);
  return cufftExecD2Z(plan, in, dst);
}
|
||||||
|
|
||||||
|
// C2R
|
||||||
|
// Single-precision complex-to-real execution: forwards to cufftExecC2R with the
// input pointer reinterpreted as cufftComplex.
inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<float>* in, float* out) {
  cufftComplex* const src = reinterpret_cast<cufftComplex*>(in);
  return cufftExecC2R(plan, src, out);
}

// Double-precision complex-to-real execution: forwards to cufftExecZ2D with the
// input pointer reinterpreted as cufftDoubleComplex.
inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<double>* in, double* out) {
  cufftDoubleComplex* const src = reinterpret_cast<cufftDoubleComplex*>(in);
  return cufftExecZ2D(plan, src, out);
}
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_CUFFT_SUPPORT_H
|
||||||
159
Eigen/src/GPU/CuSolverSupport.h
Normal file
159
Eigen/src/GPU/CuSolverSupport.h
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// cuSOLVER-specific support types:
|
||||||
|
// - cuSOLVER error-checking macro
|
||||||
|
// - RAII wrapper for cusolverDnParams
|
||||||
|
// - Scalar → cudaDataType_t mapping
|
||||||
|
// - (UpLo, StorageOrder) → cublasFillMode_t mapping
|
||||||
|
//
|
||||||
|
// Generic CUDA runtime utilities (DeviceBuffer, EIGEN_CUDA_RUNTIME_CHECK)
|
||||||
|
// are in GpuSupport.h.
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_CUSOLVER_SUPPORT_H
|
||||||
|
#define EIGEN_GPU_CUSOLVER_SUPPORT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSupport.h"
|
||||||
|
#include <cusolverDn.h>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
// ---- Error-checking macros --------------------------------------------------
|
||||||
|
|
||||||
|
// Evaluates the cuSOLVER call `expr` exactly once and asserts (via
// eigen_assert) that it returned CUSOLVER_STATUS_SUCCESS.
//
// Kept consistent with the cuDSS / cuFFT / cuSPARSE check macros:
//  - the stringified call is appended to the assertion message, so a failure
//    identifies which call went wrong;
//  - the status is marked as used, so builds in which eigen_assert expands to
//    nothing do not emit an unused-variable warning for `_s`.
#define EIGEN_CUSOLVER_CHECK(expr)                                                 \
  do {                                                                             \
    cusolverStatus_t _s = (expr);                                                  \
    eigen_assert(_s == CUSOLVER_STATUS_SUCCESS && "cuSOLVER call failed: " #expr); \
    EIGEN_UNUSED_VARIABLE(_s);                                                     \
  } while (0)
|
||||||
|
|
||||||
|
// ---- RAII: cusolverDnParams -------------------------------------------------
|
||||||
|
|
||||||
|
struct CusolverParams {
|
||||||
|
cusolverDnParams_t p = nullptr;
|
||||||
|
|
||||||
|
CusolverParams() { EIGEN_CUSOLVER_CHECK(cusolverDnCreateParams(&p)); }
|
||||||
|
|
||||||
|
~CusolverParams() {
|
||||||
|
if (p) (void)cusolverDnDestroyParams(p); // destructor: can't propagate
|
||||||
|
}
|
||||||
|
|
||||||
|
// Move-only.
|
||||||
|
CusolverParams(CusolverParams&& o) noexcept : p(o.p) { o.p = nullptr; }
|
||||||
|
CusolverParams& operator=(CusolverParams&& o) noexcept {
|
||||||
|
if (this != &o) {
|
||||||
|
if (p) (void)cusolverDnDestroyParams(p);
|
||||||
|
p = o.p;
|
||||||
|
o.p = nullptr;
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
CusolverParams(const CusolverParams&) = delete;
|
||||||
|
CusolverParams& operator=(const CusolverParams&) = delete;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- Scalar → cudaDataType_t ------------------------------------------------
|
||||||
|
// Alias for backward compatibility. The canonical trait is cuda_data_type<> in GpuSupport.h.
|
||||||
|
template <typename Scalar>
|
||||||
|
using cusolver_data_type = cuda_data_type<Scalar>;
|
||||||
|
|
||||||
|
// ---- (UpLo, StorageOrder) → cublasFillMode_t --------------------------------
|
||||||
|
// cuSOLVER always interprets the matrix as column-major. A row-major matrix A
|
||||||
|
// appears as A^T to cuSOLVER, so the upper/lower triangle is swapped.
|
||||||
|
|
||||||
|
template <int UpLo, int StorageOrder>
|
||||||
|
struct cusolver_fill_mode;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cusolver_fill_mode<Lower, ColMajor> {
|
||||||
|
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cusolver_fill_mode<Upper, ColMajor> {
|
||||||
|
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_UPPER;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cusolver_fill_mode<Lower, RowMajor> {
|
||||||
|
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_UPPER;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cusolver_fill_mode<Upper, RowMajor> {
|
||||||
|
static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- Type-specific cuSOLVER wrappers ----------------------------------------
|
||||||
|
// cuSOLVER does not provide generic X variants for ormqr/unmqr. These overloaded
|
||||||
|
// wrappers dispatch to the correct type-specific function.
|
||||||
|
// For real types: ormqr (orthogonal Q). For complex types: unmqr (unitary Q).
|
||||||
|
|
||||||
|
// Overload set that selects the type-specific ormqr/unmqr routine from the
// scalar type of the pointer arguments. All arguments are forwarded in order;
// complex pointers are reinterpreted as the matching cuComplex /
// cuDoubleComplex pointer. The cusolverStatus_t of the underlying call is
// returned unchanged.

// float -> cusolverDnSormqr
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
                                         int n, int k, const float* A, int lda, const float* tau, float* C, int ldc,
                                         float* work, int lwork, int* info) {
  return cusolverDnSormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
}

// double -> cusolverDnDormqr
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
                                         int n, int k, const double* A, int lda, const double* tau, double* C, int ldc,
                                         double* work, int lwork, int* info) {
  return cusolverDnDormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
}

// complex<float> -> cusolverDnCunmqr
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
                                         int n, int k, const std::complex<float>* A, int lda,
                                         const std::complex<float>* tau, std::complex<float>* C, int ldc,
                                         std::complex<float>* work, int lwork, int* info) {
  const cuComplex* const a_ptr = reinterpret_cast<const cuComplex*>(A);
  const cuComplex* const tau_ptr = reinterpret_cast<const cuComplex*>(tau);
  cuComplex* const c_ptr = reinterpret_cast<cuComplex*>(C);
  cuComplex* const work_ptr = reinterpret_cast<cuComplex*>(work);
  return cusolverDnCunmqr(h, side, trans, m, n, k, a_ptr, lda, tau_ptr, c_ptr, ldc, work_ptr, lwork, info);
}

// complex<double> -> cusolverDnZunmqr
inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
                                         int n, int k, const std::complex<double>* A, int lda,
                                         const std::complex<double>* tau, std::complex<double>* C, int ldc,
                                         std::complex<double>* work, int lwork, int* info) {
  const cuDoubleComplex* const a_ptr = reinterpret_cast<const cuDoubleComplex*>(A);
  const cuDoubleComplex* const tau_ptr = reinterpret_cast<const cuDoubleComplex*>(tau);
  cuDoubleComplex* const c_ptr = reinterpret_cast<cuDoubleComplex*>(C);
  cuDoubleComplex* const work_ptr = reinterpret_cast<cuDoubleComplex*>(work);
  return cusolverDnZunmqr(h, side, trans, m, n, k, a_ptr, lda, tau_ptr, c_ptr, ldc, work_ptr, lwork, info);
}
|
||||||
|
|
||||||
|
// Buffer size wrappers for ormqr/unmqr.
|
||||||
|
// Overload set that selects the type-specific ormqr/unmqr workspace query from
// the scalar type of the pointer arguments. All arguments are forwarded in
// order; complex pointers are reinterpreted as the matching cuComplex /
// cuDoubleComplex pointer. The size is written through `lwork` by the
// underlying call, whose cusolverStatus_t is returned unchanged.

// float -> cusolverDnSormqr_bufferSize
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
                                                    cublasOperation_t trans, int m, int n, int k, const float* A,
                                                    int lda, const float* tau, const float* C, int ldc, int* lwork) {
  return cusolverDnSormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
}

// double -> cusolverDnDormqr_bufferSize
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
                                                    cublasOperation_t trans, int m, int n, int k, const double* A,
                                                    int lda, const double* tau, const double* C, int ldc, int* lwork) {
  return cusolverDnDormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
}

// complex<float> -> cusolverDnCunmqr_bufferSize
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
                                                    cublasOperation_t trans, int m, int n, int k,
                                                    const std::complex<float>* A, int lda,
                                                    const std::complex<float>* tau, const std::complex<float>* C,
                                                    int ldc, int* lwork) {
  const cuComplex* const a_ptr = reinterpret_cast<const cuComplex*>(A);
  const cuComplex* const tau_ptr = reinterpret_cast<const cuComplex*>(tau);
  const cuComplex* const c_ptr = reinterpret_cast<const cuComplex*>(C);
  return cusolverDnCunmqr_bufferSize(h, side, trans, m, n, k, a_ptr, lda, tau_ptr, c_ptr, ldc, lwork);
}

// complex<double> -> cusolverDnZunmqr_bufferSize
inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
                                                    cublasOperation_t trans, int m, int n, int k,
                                                    const std::complex<double>* A, int lda,
                                                    const std::complex<double>* tau, const std::complex<double>* C,
                                                    int ldc, int* lwork) {
  const cuDoubleComplex* const a_ptr = reinterpret_cast<const cuDoubleComplex*>(A);
  const cuDoubleComplex* const tau_ptr = reinterpret_cast<const cuDoubleComplex*>(tau);
  const cuDoubleComplex* const c_ptr = reinterpret_cast<const cuDoubleComplex*>(C);
  return cusolverDnZunmqr_bufferSize(h, side, trans, m, n, k, a_ptr, lda, tau_ptr, c_ptr, ldc, lwork);
}
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_CUSOLVER_SUPPORT_H
|
||||||
34
Eigen/src/GPU/CuSparseSupport.h
Normal file
34
Eigen/src/GPU/CuSparseSupport.h
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// cuSPARSE support utilities: error checking macro.
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_CUSPARSE_SUPPORT_H
|
||||||
|
#define EIGEN_GPU_CUSPARSE_SUPPORT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSupport.h"
|
||||||
|
#include <cusparse.h>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
// Evaluates the cuSPARSE call `x` exactly once and asserts (via eigen_assert)
// that it returned CUSPARSE_STATUS_SUCCESS. The stringified call is appended to
// the assertion message. The status is marked as used so that builds in which
// eigen_assert expands to nothing do not warn about an unused variable.
#define EIGEN_CUSPARSE_CHECK(x)                                                 \
  do {                                                                          \
    const cusparseStatus_t _s = (x);                                            \
    eigen_assert(_s == CUSPARSE_STATUS_SUCCESS && "cuSPARSE call failed: " #x); \
    EIGEN_UNUSED_VARIABLE(_s);                                                  \
  } while (0)
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_CUSPARSE_SUPPORT_H
|
||||||
146
Eigen/src/GPU/DeviceBlasExpr.h
Normal file
146
Eigen/src/GPU/DeviceBlasExpr.h
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// BLAS Level 3 expression types for DeviceMatrix (beyond GEMM):
|
||||||
|
// TrsmExpr → cublasXtrsm (triangular solve)
|
||||||
|
// SymmExpr → cublasXsymm (symmetric multiply, real)
|
||||||
|
// → cublasXhemm (Hermitian multiply, complex)
|
||||||
|
// SyrkExpr → cublasXsyrk (symmetric rank-k update, real)
|
||||||
|
// → cublasXherk (Hermitian rank-k update, complex)
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_DEVICE_BLAS_EXPR_H
|
||||||
|
#define EIGEN_GPU_DEVICE_BLAS_EXPR_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class DeviceMatrix;
|
||||||
|
|
||||||
|
// ---- DeviceTriangularView ---------------------------------------------------
|
||||||
|
// d_A.triangularView<Lower>() → view with .solve(d_B)
|
||||||
|
|
||||||
|
template <typename Scalar_, int UpLo_>
|
||||||
|
class DeviceTriangularView {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
enum { UpLo = UpLo_ };
|
||||||
|
|
||||||
|
explicit DeviceTriangularView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
|
||||||
|
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
|
||||||
|
|
||||||
|
/** Build a TRSM solve expression. */
|
||||||
|
TrsmExpr<Scalar, UpLo_> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
const DeviceMatrix<Scalar>& mat_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- TrsmExpr: triangularView<UpLo>().solve(B) → cublasXtrsm ---------------
|
||||||
|
|
||||||
|
template <typename Scalar_, int UpLo_>
|
||||||
|
class TrsmExpr {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
enum { UpLo = UpLo_ };
|
||||||
|
|
||||||
|
TrsmExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
|
||||||
|
const DeviceMatrix<Scalar>& matrix() const { return A_; }
|
||||||
|
const DeviceMatrix<Scalar>& rhs() const { return B_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
const DeviceMatrix<Scalar>& A_;
|
||||||
|
const DeviceMatrix<Scalar>& B_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- DeviceSelfAdjointView --------------------------------------------------
|
||||||
|
// d_A.selfadjointView<Lower>() → view that can multiply: view * d_B
|
||||||
|
|
||||||
|
template <typename Scalar_, int UpLo_>
|
||||||
|
class DeviceSelfAdjointView {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
enum { UpLo = UpLo_ };
|
||||||
|
|
||||||
|
explicit DeviceSelfAdjointView(DeviceMatrix<Scalar>& m) : mat_(m) {}
|
||||||
|
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
|
||||||
|
DeviceMatrix<Scalar>& matrix() { return mat_; }
|
||||||
|
|
||||||
|
/** Rank-k update: C.selfadjointView<Lower>().rankUpdate(A, alpha)
|
||||||
|
* computes C = alpha * A * A^H + C (lower triangle only).
|
||||||
|
* Maps to cublasXsyrk (real) or cublasXherk (complex). */
|
||||||
|
void rankUpdate(const DeviceMatrix<Scalar>& A, RealScalar alpha = RealScalar(1));
|
||||||
|
|
||||||
|
private:
|
||||||
|
DeviceMatrix<Scalar>& mat_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Const variant for multiplication only (no rankUpdate).
|
||||||
|
template <typename Scalar_, int UpLo_>
|
||||||
|
class ConstDeviceSelfAdjointView {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
enum { UpLo = UpLo_ };
|
||||||
|
|
||||||
|
explicit ConstDeviceSelfAdjointView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
|
||||||
|
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
const DeviceMatrix<Scalar>& mat_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- SymmExpr: selfadjointView<UpLo>() * B → cublasXsymm/Xhemm ------------
|
||||||
|
|
||||||
|
template <typename Scalar_, int UpLo_>
|
||||||
|
class SymmExpr {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
enum { UpLo = UpLo_ };
|
||||||
|
|
||||||
|
SymmExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
|
||||||
|
const DeviceMatrix<Scalar>& matrix() const { return A_; }
|
||||||
|
const DeviceMatrix<Scalar>& rhs() const { return B_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
const DeviceMatrix<Scalar>& A_;
|
||||||
|
const DeviceMatrix<Scalar>& B_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// operator*: DeviceSelfAdjointView * DeviceMatrix → SymmExpr (mutable and const variants)
|
||||||
|
template <typename S, int UpLo>
|
||||||
|
SymmExpr<S, UpLo> operator*(const DeviceSelfAdjointView<S, UpLo>& a, const DeviceMatrix<S>& b) {
|
||||||
|
return {a.matrix(), b};
|
||||||
|
}
|
||||||
|
template <typename S, int UpLo>
|
||||||
|
SymmExpr<S, UpLo> operator*(const ConstDeviceSelfAdjointView<S, UpLo>& a, const DeviceMatrix<S>& b) {
|
||||||
|
return {a.matrix(), b};
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SyrkExpr: rankUpdate(A) → cublasXsyrk/Xherk ---------------------------
|
||||||
|
// C.rankUpdate(A) computes C += A * A^H (or A^H * A depending on convention).
|
||||||
|
|
||||||
|
template <typename Scalar_, int UpLo_>
|
||||||
|
class SyrkExpr {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
enum { UpLo = UpLo_ };
|
||||||
|
|
||||||
|
SyrkExpr(const DeviceMatrix<Scalar>& A) : A_(A) {}
|
||||||
|
const DeviceMatrix<Scalar>& matrix() const { return A_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
const DeviceMatrix<Scalar>& A_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_DEVICE_BLAS_EXPR_H
|
||||||
787
Eigen/src/GPU/DeviceDispatch.h
Normal file
787
Eigen/src/GPU/DeviceDispatch.h
Normal file
@@ -0,0 +1,787 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Dispatch functions that map DeviceMatrix expressions to NVIDIA library calls.
|
||||||
|
//
|
||||||
|
//   dispatch_gemm()      — GemmExpr → cublasLtMatmul
|
||||||
|
//
|
||||||
|
// Each function documents the exact library call and parameters.
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_DEVICE_DISPATCH_H
|
||||||
|
#define EIGEN_GPU_DEVICE_DISPATCH_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./DeviceExpr.h"
|
||||||
|
#include "./DeviceBlasExpr.h"
|
||||||
|
#include "./DeviceSolverExpr.h"
|
||||||
|
#include "./GpuContext.h"
|
||||||
|
#include "./CuSolverSupport.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
// ---- GEMM dispatch ----------------------------------------------------------
|
||||||
|
// GemmExpr<Lhs, Rhs> → cublasLtMatmul via GpuContext.
|
||||||
|
//
|
||||||
|
// Uses cublasLtMatmul for 64-bit dimension support and heuristic algorithm
|
||||||
|
// selection. All scalar types (float, double, complex<float>, complex<double>)
|
||||||
|
// are handled via cudaDataType_t.
|
||||||
|
|
||||||
|
template <typename Lhs, typename Rhs>
|
||||||
|
void dispatch_gemm(
|
||||||
|
GpuContext& ctx, DeviceMatrix<typename device_expr_traits<Lhs>::scalar_type>& dst, const GemmExpr<Lhs, Rhs>& expr,
|
||||||
|
typename device_expr_traits<Lhs>::scalar_type beta_val,
|
||||||
|
typename device_expr_traits<Lhs>::scalar_type alpha_scale = typename device_expr_traits<Lhs>::scalar_type(1)) {
|
||||||
|
using Scalar = typename device_expr_traits<Lhs>::scalar_type;
|
||||||
|
using traits_lhs = device_expr_traits<Lhs>;
|
||||||
|
using traits_rhs = device_expr_traits<Rhs>;
|
||||||
|
|
||||||
|
const DeviceMatrix<Scalar>& A = traits_lhs::matrix(expr.lhs());
|
||||||
|
const DeviceMatrix<Scalar>& B = traits_rhs::matrix(expr.rhs());
|
||||||
|
|
||||||
|
// cuBLAS GEMM: C must not alias A or B (undefined behavior).
|
||||||
|
eigen_assert(dst.data() != A.data() && "GEMM: output aliases left operand (use a temporary)");
|
||||||
|
eigen_assert(dst.data() != B.data() && "GEMM: output aliases right operand (use a temporary)");
|
||||||
|
|
||||||
|
constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op);
|
||||||
|
constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op);
|
||||||
|
|
||||||
|
// GEMM dimensions: C(m,n) = op(A)(m,k) * op(B)(k,n)
|
||||||
|
// op(A) has dimensions (A.rows, A.cols) if NoTrans, (A.cols, A.rows) if Trans/ConjTrans.
|
||||||
|
const int64_t m = (traits_lhs::op == GpuOp::NoTrans) ? A.rows() : A.cols();
|
||||||
|
const int64_t k = (traits_lhs::op == GpuOp::NoTrans) ? A.cols() : A.rows();
|
||||||
|
const int64_t n = (traits_rhs::op == GpuOp::NoTrans) ? B.cols() : B.rows();
|
||||||
|
const int64_t rhs_k = (traits_rhs::op == GpuOp::NoTrans) ? B.rows() : B.cols();
|
||||||
|
|
||||||
|
eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch");
|
||||||
|
|
||||||
|
const int64_t lda = A.rows();
|
||||||
|
const int64_t ldb = B.rows();
|
||||||
|
|
||||||
|
// Serialize all accesses to the destination buffer on this stream.
|
||||||
|
if (!dst.empty()) {
|
||||||
|
dst.waitReady(ctx.stream());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate or resize destination.
|
||||||
|
const bool resized = dst.empty() || dst.rows() != m || dst.cols() != n;
|
||||||
|
if (resized) {
|
||||||
|
dst.resize(m, n);
|
||||||
|
}
|
||||||
|
const int64_t ldc = dst.rows();
|
||||||
|
|
||||||
|
// cuBLAS requires alpha/beta as float for half/bfloat16 inputs.
|
||||||
|
using GemmScalar = typename cuda_gemm_scalar<Scalar>::type;
|
||||||
|
GemmScalar alpha_gval =
|
||||||
|
static_cast<GemmScalar>(alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs()));
|
||||||
|
GemmScalar beta_gval = static_cast<GemmScalar>(beta_val);
|
||||||
|
|
||||||
|
// Wait for operands to be ready on this stream.
|
||||||
|
A.waitReady(ctx.stream());
|
||||||
|
B.waitReady(ctx.stream());
|
||||||
|
|
||||||
|
// If there is no existing valid destination to accumulate into, treat it as
|
||||||
|
// zero rather than reading uninitialized memory.
|
||||||
|
if (resized && beta_gval != GemmScalar(0) && dst.sizeInBytes() > 0) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
|
||||||
|
}
|
||||||
|
|
||||||
|
cublaslt_gemm<Scalar>(ctx.cublasLtHandle(), ctx.cublasHandle(), transA, transB, m, n, k, &alpha_gval, A.data(), lda,
|
||||||
|
B.data(), ldb, &beta_gval, dst.data(), ldc, ctx.gemmWorkspace(), ctx.stream());
|
||||||
|
|
||||||
|
dst.recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- LLT solve dispatch -----------------------------------------------------
|
||||||
|
// LltSolveExpr → cusolverDnXpotrf (factorize) + cusolverDnXpotrs (solve).
|
||||||
|
// No caching — factor and workspace are temporary. Syncs to check info.
|
||||||
|
|
||||||
|
template <typename Scalar, int UpLo>
|
||||||
|
void dispatch_llt_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LltSolveExpr<Scalar, UpLo>& expr) {
|
||||||
|
const DeviceMatrix<Scalar>& A = expr.matrix();
|
||||||
|
const DeviceMatrix<Scalar>& B = expr.rhs();
|
||||||
|
|
||||||
|
eigen_assert(A.rows() == A.cols() && "LLT requires a square matrix");
|
||||||
|
eigen_assert(B.rows() == A.rows() && "LLT solve: RHS rows must match matrix size");
|
||||||
|
|
||||||
|
const Index n = A.rows();
|
||||||
|
const int64_t nrhs = static_cast<int64_t>(B.cols());
|
||||||
|
|
||||||
|
// Zero-size fast paths: no work, just resize dst.
|
||||||
|
// Wait on dst before resize to avoid freeing memory another stream is using.
|
||||||
|
if (n == 0 || nrhs == 0) {
|
||||||
|
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||||
|
dst.resize(n == 0 ? 0 : n, B.cols());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
A.waitReady(ctx.stream());
|
||||||
|
B.waitReady(ctx.stream());
|
||||||
|
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||||
|
|
||||||
|
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
||||||
|
constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo, ColMajor>::value;
|
||||||
|
const int64_t lda = static_cast<int64_t>(A.rows());
|
||||||
|
const int64_t ldb = static_cast<int64_t>(B.rows());
|
||||||
|
|
||||||
|
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
|
||||||
|
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||||
|
|
||||||
|
// D2D copy A → factor buffer (potrf is in-place).
|
||||||
|
DeviceBuffer d_factor(mat_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_factor.ptr, A.data(), mat_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
|
||||||
|
|
||||||
|
// Query workspace and factorize.
|
||||||
|
CusolverParams params;
|
||||||
|
DeviceBuffer d_factorize_info(sizeof(int));
|
||||||
|
size_t dev_ws = 0, host_ws = 0;
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), dtype,
|
||||||
|
d_factor.ptr, lda, dtype, &dev_ws, &host_ws));
|
||||||
|
|
||||||
|
DeviceBuffer d_workspace(dev_ws);
|
||||||
|
std::vector<char> h_workspace(host_ws);
|
||||||
|
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(
|
||||||
|
ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), dtype, d_factor.ptr, lda, dtype, d_workspace.ptr,
|
||||||
|
dev_ws, host_ws > 0 ? h_workspace.data() : nullptr, host_ws, static_cast<int*>(d_factorize_info.ptr)));
|
||||||
|
|
||||||
|
// Check factorization info before proceeding to solve.
|
||||||
|
int factorize_info = 0;
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(&factorize_info, d_factorize_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
|
||||||
|
eigen_assert(factorize_info == 0 && "cuSOLVER LLT factorization failed (matrix not positive definite)");
|
||||||
|
|
||||||
|
// D2D copy B → dst (potrs is in-place on the RHS).
|
||||||
|
dst.resize(n, B.cols());
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
|
||||||
|
|
||||||
|
// Solve.
|
||||||
|
DeviceBuffer d_solve_info(sizeof(int));
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, static_cast<int64_t>(n), nrhs, dtype,
|
||||||
|
d_factor.ptr, lda, dtype, dst.data(), static_cast<int64_t>(dst.rows()),
|
||||||
|
static_cast<int*>(d_solve_info.ptr)));
|
||||||
|
|
||||||
|
// Sync to ensure workspace locals can be freed safely.
|
||||||
|
int solve_info = 0;
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(&solve_info, d_solve_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
|
||||||
|
eigen_assert(solve_info == 0 && "cuSOLVER LLT solve failed");
|
||||||
|
|
||||||
|
dst.recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- LU solve dispatch ------------------------------------------------------
|
||||||
|
// LuSolveExpr → cusolverDnXgetrf (factorize) + cusolverDnXgetrs (solve).
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void dispatch_lu_solve(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const LuSolveExpr<Scalar>& expr) {
|
||||||
|
const DeviceMatrix<Scalar>& A = expr.matrix();
|
||||||
|
const DeviceMatrix<Scalar>& B = expr.rhs();
|
||||||
|
|
||||||
|
eigen_assert(A.rows() == A.cols() && "LU requires a square matrix");
|
||||||
|
eigen_assert(B.rows() == A.rows() && "LU solve: RHS rows must match matrix size");
|
||||||
|
|
||||||
|
const Index n = A.rows();
|
||||||
|
const int64_t nrhs = static_cast<int64_t>(B.cols());
|
||||||
|
|
||||||
|
if (n == 0 || nrhs == 0) {
|
||||||
|
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||||
|
dst.resize(n == 0 ? 0 : n, B.cols());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
A.waitReady(ctx.stream());
|
||||||
|
B.waitReady(ctx.stream());
|
||||||
|
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||||
|
|
||||||
|
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
||||||
|
const int64_t lda = static_cast<int64_t>(A.rows());
|
||||||
|
const int64_t ldb = static_cast<int64_t>(B.rows());
|
||||||
|
|
||||||
|
const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
|
||||||
|
const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||||
|
const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t);
|
||||||
|
|
||||||
|
// D2D copy A → LU buffer (getrf is in-place).
|
||||||
|
DeviceBuffer d_lu(mat_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu.ptr, A.data(), mat_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
|
||||||
|
|
||||||
|
DeviceBuffer d_ipiv(ipiv_bytes);
|
||||||
|
|
||||||
|
// Query workspace and factorize.
|
||||||
|
CusolverParams params;
|
||||||
|
DeviceBuffer d_factorize_info(sizeof(int));
|
||||||
|
size_t dev_ws = 0, host_ws = 0;
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(ctx.cusolverHandle(), params.p, static_cast<int64_t>(n),
|
||||||
|
static_cast<int64_t>(n), dtype, d_lu.ptr, lda, dtype, &dev_ws,
|
||||||
|
&host_ws));
|
||||||
|
|
||||||
|
DeviceBuffer d_workspace(dev_ws);
|
||||||
|
std::vector<char> h_workspace(host_ws);
|
||||||
|
|
||||||
|
EIGEN_CUSOLVER_CHECK(
|
||||||
|
cusolverDnXgetrf(ctx.cusolverHandle(), params.p, static_cast<int64_t>(n), static_cast<int64_t>(n), dtype,
|
||||||
|
d_lu.ptr, lda, static_cast<int64_t*>(d_ipiv.ptr), dtype, d_workspace.ptr, dev_ws,
|
||||||
|
host_ws > 0 ? h_workspace.data() : nullptr, host_ws, static_cast<int*>(d_factorize_info.ptr)));
|
||||||
|
|
||||||
|
// Check factorization info before proceeding to solve.
|
||||||
|
int factorize_info = 0;
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(&factorize_info, d_factorize_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
|
||||||
|
eigen_assert(factorize_info == 0 && "cuSOLVER LU factorization failed (singular matrix)");
|
||||||
|
|
||||||
|
// D2D copy B → dst (getrs is in-place on the RHS).
|
||||||
|
dst.resize(n, B.cols());
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
|
||||||
|
|
||||||
|
// Solve (NoTranspose).
|
||||||
|
DeviceBuffer d_solve_info(sizeof(int));
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, static_cast<int64_t>(n), nrhs,
|
||||||
|
dtype, d_lu.ptr, lda, static_cast<const int64_t*>(d_ipiv.ptr), dtype,
|
||||||
|
dst.data(), static_cast<int64_t>(dst.rows()),
|
||||||
|
static_cast<int*>(d_solve_info.ptr)));
|
||||||
|
|
||||||
|
// Sync to ensure workspace locals can be freed safely.
|
||||||
|
int solve_info = 0;
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(&solve_info, d_solve_info.ptr, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
|
||||||
|
eigen_assert(solve_info == 0 && "cuSOLVER LU solve failed");
|
||||||
|
|
||||||
|
dst.recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- TRSM dispatch ----------------------------------------------------------
|
||||||
|
// TrsmExpr → cublasXtrsm: solve op(A) * X = B where A is triangular.
|
||||||
|
// Side=Left, Diag=NonUnit. A is square, B is n×nrhs.
|
||||||
|
|
||||||
|
template <typename Scalar, int UpLo>
|
||||||
|
void dispatch_trsm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const TrsmExpr<Scalar, UpLo>& expr) {
|
||||||
|
const DeviceMatrix<Scalar>& A = expr.matrix();
|
||||||
|
const DeviceMatrix<Scalar>& B = expr.rhs();
|
||||||
|
|
||||||
|
eigen_assert(A.rows() == A.cols() && "TRSM requires a square triangular matrix");
|
||||||
|
eigen_assert(B.rows() == A.rows() && "TRSM: RHS rows must match matrix size");
|
||||||
|
|
||||||
|
const int n = static_cast<int>(A.rows());
|
||||||
|
const int nrhs = static_cast<int>(B.cols());
|
||||||
|
|
||||||
|
if (n == 0 || nrhs == 0) {
|
||||||
|
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||||
|
dst.resize(n == 0 ? 0 : n, B.cols());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
A.waitReady(ctx.stream());
|
||||||
|
B.waitReady(ctx.stream());
|
||||||
|
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||||
|
|
||||||
|
// D2D copy B → dst (trsm is in-place on the RHS).
|
||||||
|
dst.resize(n, B.cols());
|
||||||
|
const size_t rhs_bytes = static_cast<size_t>(dst.rows()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
|
||||||
|
|
||||||
|
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
|
||||||
|
Scalar alpha(1);
|
||||||
|
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs,
|
||||||
|
&alpha, A.data(), static_cast<int>(A.rows()), dst.data(),
|
||||||
|
static_cast<int>(dst.rows())));
|
||||||
|
|
||||||
|
dst.recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SYMM/HEMM dispatch -----------------------------------------------------
|
||||||
|
// SymmExpr → cublasXsymm (real) or cublasXhemm (complex).
|
||||||
|
// C = A * B where A is symmetric/Hermitian. Side=Left.
|
||||||
|
|
||||||
|
template <typename Scalar, int UpLo>
|
||||||
|
void dispatch_symm(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SymmExpr<Scalar, UpLo>& expr) {
|
||||||
|
const DeviceMatrix<Scalar>& A = expr.matrix();
|
||||||
|
const DeviceMatrix<Scalar>& B = expr.rhs();
|
||||||
|
|
||||||
|
eigen_assert(A.rows() == A.cols() && "SYMM requires a square matrix");
|
||||||
|
eigen_assert(B.rows() == A.rows() && "SYMM: RHS rows must match matrix size");
|
||||||
|
|
||||||
|
const int m = static_cast<int>(A.rows());
|
||||||
|
const int n = static_cast<int>(B.cols());
|
||||||
|
|
||||||
|
if (m == 0 || n == 0) {
|
||||||
|
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||||
|
dst.resize(m == 0 ? 0 : m, B.cols());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
A.waitReady(ctx.stream());
|
||||||
|
B.waitReady(ctx.stream());
|
||||||
|
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||||
|
|
||||||
|
dst.resize(m, n);
|
||||||
|
|
||||||
|
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
|
||||||
|
Scalar alpha(1), beta(0);
|
||||||
|
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &alpha, A.data(),
|
||||||
|
static_cast<int>(A.rows()), B.data(), static_cast<int>(B.rows()), &beta, dst.data(),
|
||||||
|
static_cast<int>(dst.rows())));
|
||||||
|
|
||||||
|
dst.recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SYRK/HERK dispatch -----------------------------------------------------
|
||||||
|
// SyrkExpr → cublasXsyrk (real) or cublasXherk (complex).
|
||||||
|
// C = alpha * A * A^H + beta * C. UpLo specifies which triangle of C is stored.
|
||||||
|
|
||||||
|
template <typename Scalar, int UpLo>
|
||||||
|
void dispatch_syrk(GpuContext& ctx, DeviceMatrix<Scalar>& dst, const SyrkExpr<Scalar, UpLo>& expr,
|
||||||
|
typename NumTraits<Scalar>::Real alpha_val, typename NumTraits<Scalar>::Real beta_val) {
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
const DeviceMatrix<Scalar>& A = expr.matrix();
|
||||||
|
|
||||||
|
const int n = static_cast<int>(A.rows());
|
||||||
|
const int k = static_cast<int>(A.cols());
|
||||||
|
|
||||||
|
if (n == 0) {
|
||||||
|
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||||
|
dst.resize(0, 0);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
A.waitReady(ctx.stream());
|
||||||
|
if (!dst.empty()) dst.waitReady(ctx.stream());
|
||||||
|
|
||||||
|
if (dst.empty() || dst.rows() != n || dst.cols() != n) {
|
||||||
|
dst.resize(n, n);
|
||||||
|
if (beta_val != RealScalar(0)) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
|
||||||
|
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(),
|
||||||
|
static_cast<int>(A.rows()), &beta_val, dst.data(), static_cast<int>(dst.rows())));
|
||||||
|
|
||||||
|
dst.recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
|
||||||
|
// ---- DeviceAssignment: d_C.device(ctx) = expr ------------------------------
|
||||||
|
// Returned by DeviceMatrix::device(ctx). Dispatches expressions to library calls.
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class DeviceAssignment {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
|
||||||
|
DeviceAssignment(DeviceMatrix<Scalar>& dst, GpuContext& ctx) : dst_(dst), ctx_(ctx) {}
|
||||||
|
|
||||||
|
// operator= dispatches GEMM with beta=0 (overwrite).
|
||||||
|
template <typename Lhs, typename Rhs>
|
||||||
|
DeviceMatrix<Scalar>& operator=(const GemmExpr<Lhs, Rhs>& expr) {
|
||||||
|
internal::dispatch_gemm(ctx_, dst_, expr, Scalar(0));
|
||||||
|
return dst_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// operator+= dispatches GEMM with beta=1 (accumulate).
|
||||||
|
template <typename Lhs, typename Rhs>
|
||||||
|
DeviceMatrix<Scalar>& operator+=(const GemmExpr<Lhs, Rhs>& expr) {
|
||||||
|
internal::dispatch_gemm(ctx_, dst_, expr, Scalar(1));
|
||||||
|
return dst_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// operator-= dispatches GEMM with negated alpha, beta=1: C = C - alpha*op(A)*op(B).
|
||||||
|
template <typename Lhs, typename Rhs>
|
||||||
|
DeviceMatrix<Scalar>& operator-=(const GemmExpr<Lhs, Rhs>& expr) {
|
||||||
|
internal::dispatch_gemm(ctx_, dst_, expr, Scalar(1), Scalar(-1));
|
||||||
|
return dst_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// operator= dispatches LLT solve (potrf + potrs).
|
||||||
|
template <int UpLo>
|
||||||
|
DeviceMatrix<Scalar>& operator=(const LltSolveExpr<Scalar, UpLo>& expr) {
|
||||||
|
internal::dispatch_llt_solve(ctx_, dst_, expr);
|
||||||
|
return dst_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// operator= dispatches LU solve (getrf + getrs).
|
||||||
|
DeviceMatrix<Scalar>& operator=(const LuSolveExpr<Scalar>& expr) {
|
||||||
|
internal::dispatch_lu_solve(ctx_, dst_, expr);
|
||||||
|
return dst_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// operator= dispatches TRSM (triangular solve).
|
||||||
|
template <int UpLo>
|
||||||
|
DeviceMatrix<Scalar>& operator=(const TrsmExpr<Scalar, UpLo>& expr) {
|
||||||
|
internal::dispatch_trsm(ctx_, dst_, expr);
|
||||||
|
return dst_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// operator= dispatches SYMM/HEMM (symmetric/Hermitian multiply).
|
||||||
|
template <int UpLo>
|
||||||
|
DeviceMatrix<Scalar>& operator=(const SymmExpr<Scalar, UpLo>& expr) {
|
||||||
|
internal::dispatch_symm(ctx_, dst_, expr);
|
||||||
|
return dst_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Catch-all: static_assert for unsupported expressions.
|
||||||
|
template <typename Expr>
|
||||||
|
DeviceMatrix<Scalar>& operator=(const Expr&) {
|
||||||
|
static_assert(sizeof(Expr) == 0,
|
||||||
|
"DeviceMatrix expression not supported: no cuBLAS/cuSOLVER mapping. "
|
||||||
|
"Supported: GEMM (A*B), TRSM (.triangularView().solve()), "
|
||||||
|
"SYMM (.selfadjointView()*B), LLT (.llt().solve()), LU (.lu().solve()).");
|
||||||
|
return dst_;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
DeviceMatrix<Scalar>& dst_;
|
||||||
|
GpuContext& ctx_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- Out-of-line DeviceMatrix expression operator= definitions -------------
|
||||||
|
// These are declared in DeviceMatrix.h but defined here because they need
|
||||||
|
// GpuContext::threadLocal() which requires the full GpuContext definition.
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
template <typename Lhs, typename Rhs>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const GemmExpr<Lhs, Rhs>& expr) {
|
||||||
|
device(GpuContext::threadLocal()) = expr;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
template <typename Lhs, typename Rhs>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const GemmExpr<Lhs, Rhs>& expr) {
|
||||||
|
device(GpuContext::threadLocal()) += expr;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
template <int UpLo>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const LltSolveExpr<Scalar_, UpLo>& expr) {
|
||||||
|
device(GpuContext::threadLocal()) = expr;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const LuSolveExpr<Scalar_>& expr) {
|
||||||
|
device(GpuContext::threadLocal()) = expr;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
template <int UpLo>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const TrsmExpr<Scalar_, UpLo>& expr) {
|
||||||
|
device(GpuContext::threadLocal()) = expr;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
template <int UpLo>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const SymmExpr<Scalar_, UpLo>& expr) {
|
||||||
|
device(GpuContext::threadLocal()) = expr;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeviceSelfAdjointView::rankUpdate — defined here because it needs GpuContext.
|
||||||
|
template <typename Scalar_, int UpLo_>
|
||||||
|
void DeviceSelfAdjointView<Scalar_, UpLo_>::rankUpdate(const DeviceMatrix<Scalar_>& A, RealScalar alpha) {
|
||||||
|
SyrkExpr<Scalar_, UpLo_> expr(A);
|
||||||
|
RealScalar beta = matrix().empty() ? RealScalar(0) : RealScalar(1);
|
||||||
|
internal::dispatch_syrk(GpuContext::threadLocal(), matrix(), expr, alpha, beta);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- DeviceMatrix BLAS-1 out-of-line definitions ----------------------------
|
||||||
|
// Defined here because they need the full GpuContext definition.
|
||||||
|
// All methods take an explicit GpuContext& so callers can ensure same-stream
|
||||||
|
// execution (zero event overhead when all operations share one context).
|
||||||
|
//
|
||||||
|
// Reduction methods (dot, norm, squaredNorm) use CUBLAS_POINTER_MODE_DEVICE:
|
||||||
|
// the scalar result is written to device memory and handed back as a
|
||||||
|
// DeviceScalar, so the reduction call itself does not read the value on the
|
||||||
|
// host. This keeps Eigen template compatibility — CG does
|
||||||
|
// `Scalar alpha = absNew / p.dot(tmp)` — while a host transfer is only needed
|
||||||
|
// where a DeviceScalar is converted to a plain Scalar. The cuBLAS handle's
|
||||||
|
// previous pointer mode is restored after every reduction.
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceScalar<typename DeviceMatrix<Scalar_>::Scalar> DeviceMatrix<Scalar_>::dot(GpuContext& ctx,
|
||||||
|
const DeviceMatrix& other) const {
|
||||||
|
const int n = static_cast<int>(rows_ * cols_);
|
||||||
|
eigen_assert(n == static_cast<int>(other.rows_ * other.cols_));
|
||||||
|
DeviceScalar<Scalar> result(Scalar(0), ctx.stream());
|
||||||
|
if (n > 0) {
|
||||||
|
waitReady(ctx.stream());
|
||||||
|
other.waitReady(ctx.stream());
|
||||||
|
cublasPointerMode_t prev;
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
|
||||||
|
EIGEN_CUBLAS_CHECK(internal::cublasXdot(ctx.cublasHandle(), n, data_, 1, other.data_, 1, result.devicePtr()));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace internal {
|
||||||
|
// Real: dot(x,x) returns DeviceScalar<Scalar> which IS DeviceScalar<RealScalar>.
|
||||||
|
// Move-construct without any sync.
|
||||||
|
template <typename Scalar, typename RealScalar>
|
||||||
|
typename std::enable_if<std::is_same<Scalar, RealScalar>::value, DeviceScalar<RealScalar>>::type squaredNorm_from_dot(
|
||||||
|
DeviceScalar<Scalar>&& d, cudaStream_t) {
|
||||||
|
return std::move(d);
|
||||||
|
}
|
||||||
|
// Complex: must sync to extract the real part (DeviceScalar arithmetic is real-only).
|
||||||
|
template <typename Scalar, typename RealScalar>
|
||||||
|
typename std::enable_if<!std::is_same<Scalar, RealScalar>::value, DeviceScalar<RealScalar>>::type squaredNorm_from_dot(
|
||||||
|
DeviceScalar<Scalar>&& d, cudaStream_t stream) {
|
||||||
|
return DeviceScalar<RealScalar>(numext::real(Scalar(d)), stream);
|
||||||
|
}
|
||||||
|
} // namespace internal
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::squaredNorm(GpuContext& ctx) const {
|
||||||
|
// Use dot(x,x) instead of nrm2()^2: dot kernel is ~4.5x faster than nrm2
|
||||||
|
// (nrm2 uses a numerically careful scaled-sum-of-squares algorithm that is
|
||||||
|
// unnecessary for CG convergence checks).
|
||||||
|
using RealScalar = typename NumTraits<Scalar_>::Real;
|
||||||
|
return internal::squaredNorm_from_dot<Scalar_, RealScalar>(dot(ctx, *this), ctx.stream());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::norm(GpuContext& ctx) const {
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
const int n = static_cast<int>(rows_ * cols_);
|
||||||
|
DeviceScalar<RealScalar> result(RealScalar(0), ctx.stream());
|
||||||
|
if (n > 0) {
|
||||||
|
waitReady(ctx.stream());
|
||||||
|
cublasPointerMode_t prev;
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
|
||||||
|
EIGEN_CUBLAS_CHECK(internal::cublasXnrm2(ctx.cublasHandle(), n, data_, 1, result.devicePtr()));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
void DeviceMatrix<Scalar_>::setZero(GpuContext& ctx) {
|
||||||
|
if (sizeInBytes() > 0) {
|
||||||
|
waitReady(ctx.stream());
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(data_, 0, sizeInBytes(), ctx.stream()));
|
||||||
|
recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
void DeviceMatrix<Scalar_>::addScaled(GpuContext& ctx, Scalar alpha, const DeviceMatrix& x) {
|
||||||
|
const int n = static_cast<int>(rows_ * cols_);
|
||||||
|
eigen_assert(n == static_cast<int>(x.rows_ * x.cols_));
|
||||||
|
if (n > 0) {
|
||||||
|
waitReady(ctx.stream());
|
||||||
|
x.waitReady(ctx.stream());
|
||||||
|
EIGEN_CUBLAS_CHECK(internal::cublasXaxpy(ctx.cublasHandle(), n, &alpha, x.data_, 1, data_, 1));
|
||||||
|
recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
void DeviceMatrix<Scalar_>::scale(GpuContext& ctx, Scalar alpha) {
|
||||||
|
const int n = static_cast<int>(rows_ * cols_);
|
||||||
|
if (n > 0) {
|
||||||
|
waitReady(ctx.stream());
|
||||||
|
EIGEN_CUBLAS_CHECK(internal::cublasXscal(ctx.cublasHandle(), n, &alpha, data_, 1));
|
||||||
|
recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
void DeviceMatrix<Scalar_>::copyFrom(GpuContext& ctx, const DeviceMatrix& other) {
|
||||||
|
// Wait on *this before resize — resize may free the old buffer while another
|
||||||
|
// stream is still reading it.
|
||||||
|
if (!empty()) waitReady(ctx.stream());
|
||||||
|
resize(other.rows_, other.cols_);
|
||||||
|
const int n = static_cast<int>(rows_ * cols_);
|
||||||
|
if (n > 0) {
|
||||||
|
other.waitReady(ctx.stream());
|
||||||
|
EIGEN_CUBLAS_CHECK(internal::cublasXcopy(ctx.cublasHandle(), n, other.data_, 1, data_, 1));
|
||||||
|
recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- BLAS-1 operator overloads for CG compatibility -------------------------
|
||||||
|
|
||||||
|
// this += alpha * x (axpy)
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceScaled<DeviceMatrix>& expr) {
|
||||||
|
addScaled(GpuContext::threadLocal(), expr.scalar(), internal::device_expr_traits<DeviceMatrix>::matrix(expr.inner()));
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// this -= alpha * x (axpy with negated alpha)
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceScaled<DeviceMatrix>& expr) {
|
||||||
|
addScaled(GpuContext::threadLocal(), -expr.scalar(),
|
||||||
|
internal::device_expr_traits<DeviceMatrix>::matrix(expr.inner()));
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// this += x (axpy with alpha=1)
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceMatrix& other) {
|
||||||
|
Scalar one(1);
|
||||||
|
addScaled(GpuContext::threadLocal(), one, other);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// this -= x (axpy with alpha=-1)
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceMatrix& other) {
|
||||||
|
Scalar neg_one(-1);
|
||||||
|
addScaled(GpuContext::threadLocal(), neg_one, other);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// this *= alpha (scal, host pointer)
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator*=(Scalar alpha) {
|
||||||
|
scale(GpuContext::threadLocal(), alpha);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// this *= alpha (scal, device pointer — avoids host sync)
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator*=(const DeviceScalar<Scalar>& alpha) {
|
||||||
|
const int n = static_cast<int>(rows_ * cols_);
|
||||||
|
if (n > 0) {
|
||||||
|
auto& ctx = GpuContext::threadLocal();
|
||||||
|
waitReady(ctx.stream());
|
||||||
|
cublasPointerMode_t prev;
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
|
||||||
|
EIGEN_CUBLAS_CHECK(internal::cublasXscal(ctx.cublasHandle(), n, alpha.devicePtr(), data_, 1));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
|
||||||
|
recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// this += DeviceScalar * x (axpy with CUBLAS_POINTER_MODE_DEVICE)
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceScaledDevice<Scalar_>& expr) {
|
||||||
|
const int n = static_cast<int>(rows_ * cols_);
|
||||||
|
const auto& x = expr.matrix();
|
||||||
|
eigen_assert(n == static_cast<int>(x.rows_ * x.cols_));
|
||||||
|
if (n > 0) {
|
||||||
|
auto& ctx = GpuContext::threadLocal();
|
||||||
|
waitReady(ctx.stream());
|
||||||
|
x.waitReady(ctx.stream());
|
||||||
|
cublasPointerMode_t prev;
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
|
||||||
|
EIGEN_CUBLAS_CHECK(internal::cublasXaxpy(ctx.cublasHandle(), n, expr.alpha().devicePtr(), x.data_, 1, data_, 1));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
|
||||||
|
recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// this -= DeviceScalar * x (axpy with negated device scalar)
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceScaledDevice<Scalar_>& expr) {
|
||||||
|
auto neg_alpha = -expr.alpha();
|
||||||
|
DeviceScaledDevice<Scalar_> neg_expr(neg_alpha, expr.matrix());
|
||||||
|
return operator+=(neg_expr);
|
||||||
|
}
|
||||||
|
|
||||||
|
// this = alpha * A + beta * B (cuBLAS geam)
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const DeviceAddExpr<Scalar_>& expr) {
|
||||||
|
auto& ctx = GpuContext::threadLocal();
|
||||||
|
const auto& A = expr.A();
|
||||||
|
const auto& B = expr.B();
|
||||||
|
eigen_assert(A.rows() == B.rows() && A.cols() == B.cols());
|
||||||
|
const int m = static_cast<int>(A.rows());
|
||||||
|
const int n = static_cast<int>(A.cols());
|
||||||
|
// Wait on *this before resize — resize may free the old buffer while another
|
||||||
|
// stream is still reading it.
|
||||||
|
if (!empty()) waitReady(ctx.stream());
|
||||||
|
resize(A.rows(), A.cols());
|
||||||
|
if (m > 0 && n > 0) {
|
||||||
|
A.waitReady(ctx.stream());
|
||||||
|
B.waitReady(ctx.stream());
|
||||||
|
Scalar_ alpha = expr.alpha();
|
||||||
|
Scalar_ beta = expr.beta();
|
||||||
|
EIGEN_CUBLAS_CHECK(internal::cublasXgeam(ctx.cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, &alpha, A.data(), m,
|
||||||
|
&beta, B.data(), m, data_, m));
|
||||||
|
recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// cwiseProduct via NPP nppsMul (allocating).
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceMatrix<Scalar_> DeviceMatrix<Scalar_>::cwiseProduct(GpuContext& ctx, const DeviceMatrix& other) const {
|
||||||
|
const int n = static_cast<int>(rows_ * cols_);
|
||||||
|
eigen_assert(n == static_cast<int>(other.rows_ * other.cols_));
|
||||||
|
DeviceMatrix result(rows_, cols_);
|
||||||
|
if (n > 0) {
|
||||||
|
waitReady(ctx.stream());
|
||||||
|
other.waitReady(ctx.stream());
|
||||||
|
internal::device_cwiseProduct(data_, other.data_, result.data_, n, ctx.stream());
|
||||||
|
result.recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// In-place cwiseProduct: this = a .* b (reuses this buffer, no allocation).
|
||||||
|
template <typename Scalar_>
|
||||||
|
void DeviceMatrix<Scalar_>::cwiseProduct(GpuContext& ctx, const DeviceMatrix& a, const DeviceMatrix& b) {
|
||||||
|
const int n = static_cast<int>(a.rows_ * a.cols_);
|
||||||
|
eigen_assert(n == static_cast<int>(b.rows_ * b.cols_));
|
||||||
|
if (!empty()) waitReady(ctx.stream());
|
||||||
|
resize(a.rows_, a.cols_);
|
||||||
|
if (n > 0) {
|
||||||
|
a.waitReady(ctx.stream());
|
||||||
|
b.waitReady(ctx.stream());
|
||||||
|
internal::device_cwiseProduct(a.data_, b.data_, data_, n, ctx.stream());
|
||||||
|
recordReady(ctx.stream());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convenience overloads using thread-local default GpuContext.
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceScalar<typename DeviceMatrix<Scalar_>::Scalar> DeviceMatrix<Scalar_>::dot(const DeviceMatrix& other) const {
|
||||||
|
return dot(GpuContext::threadLocal(), other);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::squaredNorm() const {
|
||||||
|
return squaredNorm(GpuContext::threadLocal());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::norm() const {
|
||||||
|
return norm(GpuContext::threadLocal());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
void DeviceMatrix<Scalar_>::setZero() {
|
||||||
|
setZero(GpuContext::threadLocal());
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_DEVICE_DISPATCH_H
|
||||||
305
Eigen/src/GPU/DeviceExpr.h
Normal file
305
Eigen/src/GPU/DeviceExpr.h
Normal file
@@ -0,0 +1,305 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Lightweight expression types for DeviceMatrix operations.
|
||||||
|
//
|
||||||
|
// These are NOT Eigen expression templates. Each type maps 1:1 to a single
|
||||||
|
// NVIDIA library call (cuBLAS or cuSOLVER). There is no coefficient-level
|
||||||
|
// evaluation, no lazy fusion, no packet operations.
|
||||||
|
//
|
||||||
|
// Expression types:
|
||||||
|
// DeviceAdjointView<S> — d_A.adjoint() → marks ConjTrans for GEMM
|
||||||
|
// DeviceTransposeView<S> — d_A.transpose() → marks Trans for GEMM
|
||||||
|
// DeviceScaled<Expr> — alpha * expr → carries scalar factor
|
||||||
|
// GemmExpr<Lhs, Rhs> — lhs * rhs → dispatches to cublasXgemm
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_DEVICE_EXPR_H
|
||||||
|
#define EIGEN_GPU_DEVICE_EXPR_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuBlasSupport.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
// Forward declaration.
|
||||||
|
template <typename Scalar_>
|
||||||
|
class DeviceMatrix;
|
||||||
|
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
// ---- Traits: extract operation info from expression types -------------------
|
||||||
|
|
||||||
|
// Default: a DeviceMatrix is NoTrans.
|
||||||
|
template <typename T>
|
||||||
|
struct device_expr_traits {
|
||||||
|
static constexpr bool is_device_expr = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct device_expr_traits<DeviceMatrix<Scalar>> {
|
||||||
|
using scalar_type = Scalar;
|
||||||
|
static constexpr GpuOp op = GpuOp::NoTrans;
|
||||||
|
static constexpr bool is_device_expr = true;
|
||||||
|
static const DeviceMatrix<Scalar>& matrix(const DeviceMatrix<Scalar>& x) { return x; }
|
||||||
|
static Scalar alpha(const DeviceMatrix<Scalar>&) { return Scalar(1); }
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
|
||||||
|
// ---- DeviceAdjointView: marks ConjTrans ------------------------------------
|
||||||
|
// Returned by DeviceMatrix::adjoint(). Maps to cublasXgemm transA/B = C.
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class DeviceAdjointView {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
explicit DeviceAdjointView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
|
||||||
|
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
const DeviceMatrix<Scalar>& mat_;
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace internal {
|
||||||
|
template <typename Scalar>
|
||||||
|
struct device_expr_traits<DeviceAdjointView<Scalar>> {
|
||||||
|
using scalar_type = Scalar;
|
||||||
|
static constexpr GpuOp op = GpuOp::ConjTrans;
|
||||||
|
static constexpr bool is_device_expr = true;
|
||||||
|
static const DeviceMatrix<Scalar>& matrix(const DeviceAdjointView<Scalar>& x) { return x.matrix(); }
|
||||||
|
static Scalar alpha(const DeviceAdjointView<Scalar>&) { return Scalar(1); }
|
||||||
|
};
|
||||||
|
} // namespace internal
|
||||||
|
|
||||||
|
// ---- DeviceTransposeView: marks Trans --------------------------------------
|
||||||
|
// Returned by DeviceMatrix::transpose(). Maps to cublasXgemm transA/B = T.
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class DeviceTransposeView {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
explicit DeviceTransposeView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
|
||||||
|
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
const DeviceMatrix<Scalar>& mat_;
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace internal {
|
||||||
|
template <typename Scalar>
|
||||||
|
struct device_expr_traits<DeviceTransposeView<Scalar>> {
|
||||||
|
using scalar_type = Scalar;
|
||||||
|
static constexpr GpuOp op = GpuOp::Trans;
|
||||||
|
static constexpr bool is_device_expr = true;
|
||||||
|
static const DeviceMatrix<Scalar>& matrix(const DeviceTransposeView<Scalar>& x) { return x.matrix(); }
|
||||||
|
static Scalar alpha(const DeviceTransposeView<Scalar>&) { return Scalar(1); }
|
||||||
|
};
|
||||||
|
} // namespace internal
|
||||||
|
|
||||||
|
// ---- DeviceScaled: alpha * expr --------------------------------------------
|
||||||
|
// Returned by operator*(Scalar, DeviceMatrix/View). Carries the scalar factor.
|
||||||
|
|
||||||
|
template <typename Inner>
|
||||||
|
class DeviceScaled {
|
||||||
|
public:
|
||||||
|
using Scalar = typename internal::device_expr_traits<Inner>::scalar_type;
|
||||||
|
DeviceScaled(Scalar alpha, const Inner& inner) : alpha_(alpha), inner_(inner) {}
|
||||||
|
Scalar scalar() const { return alpha_; }
|
||||||
|
const Inner& inner() const { return inner_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
Scalar alpha_;
|
||||||
|
const Inner& inner_;
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace internal {
|
||||||
|
template <typename Inner>
|
||||||
|
struct device_expr_traits<DeviceScaled<Inner>> {
|
||||||
|
using scalar_type = typename device_expr_traits<Inner>::scalar_type;
|
||||||
|
static constexpr GpuOp op = device_expr_traits<Inner>::op;
|
||||||
|
static constexpr bool is_device_expr = true;
|
||||||
|
static const DeviceMatrix<scalar_type>& matrix(const DeviceScaled<Inner>& x) {
|
||||||
|
return device_expr_traits<Inner>::matrix(x.inner());
|
||||||
|
}
|
||||||
|
static scalar_type alpha(const DeviceScaled<Inner>& x) {
|
||||||
|
return x.scalar() * device_expr_traits<Inner>::alpha(x.inner());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
} // namespace internal
|
||||||
|
|
||||||
|
// ---- GemmExpr: lhs * rhs → cublasXgemm ------------------------------------
|
||||||
|
// Returned by operator*(lhs_expr, rhs_expr). Dispatches to cuBLAS GEMM.
|
||||||
|
|
||||||
|
template <typename Lhs, typename Rhs>
|
||||||
|
class GemmExpr {
|
||||||
|
public:
|
||||||
|
using Scalar = typename internal::device_expr_traits<Lhs>::scalar_type;
|
||||||
|
static_assert(std::is_same<Scalar, typename internal::device_expr_traits<Rhs>::scalar_type>::value,
|
||||||
|
"DeviceMatrix GEMM: LHS and RHS must have the same scalar type");
|
||||||
|
|
||||||
|
GemmExpr(const Lhs& lhs, const Rhs& rhs) : lhs_(lhs), rhs_(rhs) {}
|
||||||
|
const Lhs& lhs() const { return lhs_; }
|
||||||
|
const Rhs& rhs() const { return rhs_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Stored by reference. Expression objects must not outlive their operands.
|
||||||
|
// This is safe for the one-liner pattern (d_C = d_A * d_B) since all
|
||||||
|
// temporaries live until the semicolon.
|
||||||
|
const Lhs& lhs_;
|
||||||
|
const Rhs& rhs_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- Free operator* overloads that produce GemmExpr ------------------------
|
||||||
|
// These cover: DM*DM, Adj*DM, DM*Adj, Trans*DM, DM*Trans, Scaled*DM, etc.
|
||||||
|
|
||||||
|
// DeviceMatrix * DeviceMatrix
|
||||||
|
template <typename S>
|
||||||
|
GemmExpr<DeviceMatrix<S>, DeviceMatrix<S>> operator*(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
|
||||||
|
return {a, b};
|
||||||
|
}
|
||||||
|
|
||||||
|
// AdjointView * DeviceMatrix
|
||||||
|
template <typename S>
|
||||||
|
GemmExpr<DeviceAdjointView<S>, DeviceMatrix<S>> operator*(const DeviceAdjointView<S>& a, const DeviceMatrix<S>& b) {
|
||||||
|
return {a, b};
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeviceMatrix * AdjointView
|
||||||
|
template <typename S>
|
||||||
|
GemmExpr<DeviceMatrix<S>, DeviceAdjointView<S>> operator*(const DeviceMatrix<S>& a, const DeviceAdjointView<S>& b) {
|
||||||
|
return {a, b};
|
||||||
|
}
|
||||||
|
|
||||||
|
// TransposeView * DeviceMatrix
|
||||||
|
template <typename S>
|
||||||
|
GemmExpr<DeviceTransposeView<S>, DeviceMatrix<S>> operator*(const DeviceTransposeView<S>& a, const DeviceMatrix<S>& b) {
|
||||||
|
return {a, b};
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeviceMatrix * TransposeView
|
||||||
|
template <typename S>
|
||||||
|
GemmExpr<DeviceMatrix<S>, DeviceTransposeView<S>> operator*(const DeviceMatrix<S>& a, const DeviceTransposeView<S>& b) {
|
||||||
|
return {a, b};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scaled * DeviceMatrix
|
||||||
|
template <typename Inner, typename S>
|
||||||
|
GemmExpr<DeviceScaled<Inner>, DeviceMatrix<S>> operator*(const DeviceScaled<Inner>& a, const DeviceMatrix<S>& b) {
|
||||||
|
return {a, b};
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeviceMatrix * Scaled
|
||||||
|
template <typename S, typename Inner>
|
||||||
|
GemmExpr<DeviceMatrix<S>, DeviceScaled<Inner>> operator*(const DeviceMatrix<S>& a, const DeviceScaled<Inner>& b) {
|
||||||
|
return {a, b};
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Scalar * DeviceMatrix / View → DeviceScaled ---------------------------
|
||||||
|
|
||||||
|
template <typename S>
|
||||||
|
DeviceScaled<DeviceMatrix<S>> operator*(S alpha, const DeviceMatrix<S>& m) {
|
||||||
|
return {alpha, m};
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename S>
|
||||||
|
DeviceScaled<DeviceAdjointView<S>> operator*(S alpha, const DeviceAdjointView<S>& m) {
|
||||||
|
return {alpha, m};
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename S>
|
||||||
|
DeviceScaled<DeviceTransposeView<S>> operator*(S alpha, const DeviceTransposeView<S>& m) {
|
||||||
|
return {alpha, m};
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- DeviceScaledDevice: DeviceScalar * DeviceMatrix → device-pointer axpy ---
|
||||||
|
// Like DeviceScaled but carries a DeviceScalar (device pointer) instead of
|
||||||
|
// a host scalar. operator+= dispatches to cuBLAS axpy with POINTER_MODE_DEVICE.
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class DeviceScaledDevice {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
DeviceScaledDevice(const DeviceScalar<Scalar>& alpha, const DeviceMatrix<Scalar>& mat) : alpha_(alpha), mat_(mat) {}
|
||||||
|
const DeviceScalar<Scalar>& alpha() const { return alpha_; }
|
||||||
|
const DeviceMatrix<Scalar>& matrix() const { return mat_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
const DeviceScalar<Scalar>& alpha_;
|
||||||
|
const DeviceMatrix<Scalar>& mat_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// DeviceScalar * DeviceMatrix → DeviceScaledDevice
|
||||||
|
template <typename S>
|
||||||
|
DeviceScaledDevice<S> operator*(const DeviceScalar<S>& alpha, const DeviceMatrix<S>& m) {
|
||||||
|
return {alpha, m};
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- DeviceAddExpr: a + b → cublasXgeam -------------------------------------
|
||||||
|
// Captures `DeviceMatrix + DeviceScaled<DeviceMatrix>` (and reverse).
|
||||||
|
// Dispatched to geam: C = alpha * A + beta * B.
|
||||||
|
//
|
||||||
|
// Note: These operator+/- overloads are intentionally free functions on
|
||||||
|
// DeviceMatrix, not Eigen expression templates. DeviceMatrix does not inherit
|
||||||
|
// from MatrixBase, so there is no ambiguity with Eigen's own operator+/-.
|
||||||
|
// If DeviceMatrix is ever made an Eigen expression type, these would need to
|
||||||
|
// be revisited.
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class DeviceAddExpr {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
DeviceAddExpr(Scalar alpha, const DeviceMatrix<Scalar>& A, Scalar beta, const DeviceMatrix<Scalar>& B)
|
||||||
|
: alpha_(alpha), A_(A), beta_(beta), B_(B) {}
|
||||||
|
Scalar alpha() const { return alpha_; }
|
||||||
|
Scalar beta() const { return beta_; }
|
||||||
|
const DeviceMatrix<Scalar>& A() const { return A_; }
|
||||||
|
const DeviceMatrix<Scalar>& B() const { return B_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
Scalar alpha_;
|
||||||
|
const DeviceMatrix<Scalar>& A_;
|
||||||
|
Scalar beta_;
|
||||||
|
const DeviceMatrix<Scalar>& B_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// DeviceMatrix + DeviceMatrix → DeviceAddExpr (alpha=1, beta=1)
|
||||||
|
template <typename S>
|
||||||
|
DeviceAddExpr<S> operator+(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
|
||||||
|
return {S(1), a, S(1), b};
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeviceMatrix + DeviceScaled<DeviceMatrix> → DeviceAddExpr (alpha=1, beta=scaled)
|
||||||
|
template <typename S>
|
||||||
|
DeviceAddExpr<S> operator+(const DeviceMatrix<S>& a, const DeviceScaled<DeviceMatrix<S>>& b) {
|
||||||
|
return {S(1), a, b.scalar(), b.inner()};
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeviceScaled<DeviceMatrix> + DeviceMatrix → DeviceAddExpr (alpha=scaled, beta=1)
|
||||||
|
template <typename S>
|
||||||
|
DeviceAddExpr<S> operator+(const DeviceScaled<DeviceMatrix<S>>& a, const DeviceMatrix<S>& b) {
|
||||||
|
return {a.scalar(), a.inner(), S(1), b};
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeviceMatrix - DeviceMatrix → DeviceAddExpr (alpha=1, beta=-1)
|
||||||
|
template <typename S>
|
||||||
|
DeviceAddExpr<S> operator-(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
|
||||||
|
return {S(1), a, S(-1), b};
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeviceMatrix - DeviceScaled<DeviceMatrix> → DeviceAddExpr (alpha=1, beta=-scaled)
|
||||||
|
template <typename S>
|
||||||
|
DeviceAddExpr<S> operator-(const DeviceMatrix<S>& a, const DeviceScaled<DeviceMatrix<S>>& b) {
|
||||||
|
return {S(1), a, -b.scalar(), b.inner()};
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_DEVICE_EXPR_H
|
||||||
623
Eigen/src/GPU/DeviceMatrix.h
Normal file
623
Eigen/src/GPU/DeviceMatrix.h
Normal file
@@ -0,0 +1,623 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Typed RAII wrapper for a dense matrix in GPU device memory.
|
||||||
|
//
|
||||||
|
// DeviceMatrix<Scalar> holds a column-major matrix on the GPU with tracked
|
||||||
|
// dimensions. Always dense (leading dimension = rows). It can be passed to GPU solvers
|
||||||
|
// (GpuLLT, GpuLU, future cuBLAS/cuDSS) without host round-trips.
|
||||||
|
//
|
||||||
|
// Cross-stream safety is automatic: an internal CUDA event tracks when the
|
||||||
|
// last write completed. Consumers on a different stream wait on that event
|
||||||
|
// before reading.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// auto d_A = DeviceMatrix<double>::fromHost(A); // upload (sync)
|
||||||
|
// GpuLLT<double> llt;
|
||||||
|
// llt.compute(d_A); // factor on device
|
||||||
|
// auto d_X = llt.solve(d_B); // async, no sync
|
||||||
|
// MatrixXd X = d_X.toHost(); // download + block
|
||||||
|
//
|
||||||
|
// Async variants:
|
||||||
|
// auto d_A = DeviceMatrix<double>::fromHostAsync(A.data(), n, n, stream);
|
||||||
|
// auto transfer = d_X.toHostAsync(stream); // enqueue D2H
|
||||||
|
// // ... overlap with other work ...
|
||||||
|
// MatrixXd X = transfer.get(); // block + retrieve
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_DEVICE_MATRIX_H
|
||||||
|
#define EIGEN_GPU_DEVICE_MATRIX_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSupport.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
// Forward declarations.
|
||||||
|
template <typename, int>
|
||||||
|
class GpuLLT;
|
||||||
|
template <typename>
|
||||||
|
class GpuLU;
|
||||||
|
template <typename>
|
||||||
|
class DeviceAdjointView;
|
||||||
|
template <typename>
|
||||||
|
class DeviceTransposeView;
|
||||||
|
template <typename>
|
||||||
|
class DeviceAssignment;
|
||||||
|
template <typename, typename>
|
||||||
|
class GemmExpr;
|
||||||
|
template <typename>
|
||||||
|
class DeviceScaled;
|
||||||
|
template <typename>
|
||||||
|
class SpMVExpr;
|
||||||
|
template <typename>
|
||||||
|
class DeviceAddExpr;
|
||||||
|
template <typename>
|
||||||
|
class DeviceScaledDevice;
|
||||||
|
template <typename>
|
||||||
|
class DeviceScalar;
|
||||||
|
template <typename, int>
|
||||||
|
class LltSolveExpr;
|
||||||
|
template <typename>
|
||||||
|
class LuSolveExpr;
|
||||||
|
template <typename, int>
|
||||||
|
class DeviceLLTView;
|
||||||
|
template <typename>
|
||||||
|
class DeviceLUView;
|
||||||
|
template <typename, int>
|
||||||
|
class DeviceTriangularView;
|
||||||
|
template <typename, int>
|
||||||
|
class DeviceSelfAdjointView;
|
||||||
|
template <typename, int>
|
||||||
|
class ConstDeviceSelfAdjointView;
|
||||||
|
template <typename, int>
|
||||||
|
class TrsmExpr;
|
||||||
|
template <typename, int>
|
||||||
|
class SymmExpr;
|
||||||
|
template <typename, int>
|
||||||
|
class SyrkExpr;
|
||||||
|
class GpuContext;
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// HostTransfer — future-like wrapper for an async device-to-host transfer.
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** \ingroup GPU_Module
|
||||||
|
* \class HostTransfer
|
||||||
|
* \brief Future for an asynchronous device-to-host matrix transfer.
|
||||||
|
*
|
||||||
|
* Returned by DeviceMatrix::toHostAsync(). The transfer runs asynchronously
|
||||||
|
* on the given CUDA stream. Call get() to block until complete and retrieve
|
||||||
|
* the host matrix, or ready() to poll without blocking.
|
||||||
|
*/
|
||||||
|
template <typename Scalar_>
|
||||||
|
class HostTransfer {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||||
|
|
||||||
|
/** Block until the transfer completes and return the host matrix.
|
||||||
|
* Idempotent: subsequent calls return the same matrix without re-syncing. */
|
||||||
|
PlainMatrix& get() {
|
||||||
|
if (!synced_) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaEventSynchronize(event_));
|
||||||
|
synced_ = true;
|
||||||
|
}
|
||||||
|
return host_buf_;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Non-blocking check: has the transfer completed? */
|
||||||
|
bool ready() const {
|
||||||
|
if (synced_) return true;
|
||||||
|
cudaError_t err = cudaEventQuery(event_);
|
||||||
|
if (err == cudaSuccess) return true;
|
||||||
|
eigen_assert(err == cudaErrorNotReady && "cudaEventQuery failed");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
~HostTransfer() {
|
||||||
|
if (event_) (void)cudaEventDestroy(event_);
|
||||||
|
}
|
||||||
|
|
||||||
|
HostTransfer(HostTransfer&& o) noexcept : host_buf_(std::move(o.host_buf_)), event_(o.event_), synced_(o.synced_) {
|
||||||
|
o.event_ = nullptr;
|
||||||
|
o.synced_ = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
HostTransfer& operator=(HostTransfer&& o) noexcept {
|
||||||
|
if (this != &o) {
|
||||||
|
if (event_) (void)cudaEventDestroy(event_);
|
||||||
|
host_buf_ = std::move(o.host_buf_);
|
||||||
|
event_ = o.event_;
|
||||||
|
synced_ = o.synced_;
|
||||||
|
o.event_ = nullptr;
|
||||||
|
o.synced_ = true;
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
HostTransfer(const HostTransfer&) = delete;
|
||||||
|
HostTransfer& operator=(const HostTransfer&) = delete;
|
||||||
|
|
||||||
|
private:
|
||||||
|
template <typename>
|
||||||
|
friend class DeviceMatrix;
|
||||||
|
|
||||||
|
HostTransfer(PlainMatrix&& buf, cudaEvent_t event) : host_buf_(std::move(buf)), event_(event), synced_(false) {}
|
||||||
|
|
||||||
|
PlainMatrix host_buf_;
|
||||||
|
cudaEvent_t event_ = nullptr;
|
||||||
|
bool synced_ = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// DeviceMatrix — typed RAII wrapper for a dense matrix in device memory.
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** \ingroup GPU_Module
|
||||||
|
* \class DeviceMatrix
|
||||||
|
* \brief RAII wrapper for a dense column-major matrix in GPU device memory.
|
||||||
|
*
|
||||||
|
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
|
||||||
|
*
|
||||||
|
* Owns a device allocation with tracked dimensions. Always dense
|
||||||
|
* (leading dimension = rows; no stride padding).
|
||||||
|
* An internal CUDA event records when the data was last written, enabling
|
||||||
|
* safe cross-stream consumption without user-visible synchronization.
|
||||||
|
*
|
||||||
|
* Each method has a synchronous and an asynchronous variant:
|
||||||
|
* - fromHost() / fromHostAsync(): upload from host
|
||||||
|
* - toHost() / toHostAsync(): download to host
|
||||||
|
*/
|
||||||
|
template <typename Scalar_>
|
||||||
|
class DeviceMatrix {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
using PlainObject = DeviceMatrix; // owning type (for CG template compatibility)
|
||||||
|
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||||
|
|
||||||
|
// ---- Construction / destruction ------------------------------------------
|
||||||
|
|
||||||
|
/** Default: empty (0x0, no allocation). */
|
||||||
|
DeviceMatrix() = default;
|
||||||
|
|
||||||
|
/** Allocate uninitialized column vector of given size.
|
||||||
|
* Matches Matrix<Scalar,Dynamic,1>(n) for CG template compatibility. */
|
||||||
|
explicit DeviceMatrix(Index n) : rows_(n), cols_(1) {
  eigen_assert(n >= 0);
  // An n x 1 vector with n == 0 needs no device allocation.
  size_t bytes = sizeInBytes();
  if (bytes == 0) return;
  EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
}
|
||||||
|
|
||||||
|
/** Allocate uninitialized device memory for a rows x cols matrix. */
|
||||||
|
DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols) {
  eigen_assert(rows >= 0 && cols >= 0);
  // A matrix with a zero dimension needs no device allocation.
  size_t bytes = sizeInBytes();
  if (bytes == 0) return;
  EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
}
|
||||||
|
|
||||||
|
// Releases the device allocation and the ready event when present. Return
// codes of both calls are deliberately discarded.
~DeviceMatrix() {
  if (data_ != nullptr) {
    (void)cudaFree(data_);
  }
  if (ready_event_ != nullptr) {
    (void)cudaEventDestroy(ready_event_);
  }
}
|
||||||
|
|
||||||
|
// ---- Move-only -----------------------------------------------------------
|
||||||
|
|
||||||
|
// Steals the allocation, dimensions, ready event/stream and retained buffer
// of `other`, leaving it as an empty 0x0 matrix that owns nothing.
DeviceMatrix(DeviceMatrix&& other) noexcept
    : data_(other.data_),
      rows_(other.rows_),
      cols_(other.cols_),
      ready_event_(other.ready_event_),
      ready_stream_(other.ready_stream_),
      retained_buffer_(std::move(other.retained_buffer_)) {
  // Detach the source so its destructor frees nothing.
  other.data_ = nullptr;
  other.ready_event_ = nullptr;
  other.ready_stream_ = nullptr;
  other.rows_ = 0;
  other.cols_ = 0;
}
|
||||||
|
|
||||||
|
// Releases what this matrix currently owns, then steals the state of `other`
// and leaves it as an empty 0x0 matrix. Self-assignment is a no-op.
DeviceMatrix& operator=(DeviceMatrix&& other) noexcept {
  if (this == &other) return *this;

  // Drop the current allocation and event before taking over the new ones.
  if (data_) (void)cudaFree(data_);
  if (ready_event_) (void)cudaEventDestroy(ready_event_);

  data_ = other.data_;
  rows_ = other.rows_;
  cols_ = other.cols_;
  ready_event_ = other.ready_event_;
  ready_stream_ = other.ready_stream_;
  retained_buffer_ = std::move(other.retained_buffer_);

  // Detach the source so its destructor frees nothing.
  other.data_ = nullptr;
  other.rows_ = 0;
  other.cols_ = 0;
  other.ready_event_ = nullptr;
  other.ready_stream_ = nullptr;
  return *this;
}
|
||||||
|
|
||||||
|
DeviceMatrix(const DeviceMatrix&) = delete;
|
||||||
|
DeviceMatrix& operator=(const DeviceMatrix&) = delete;
|
||||||
|
|
||||||
|
// ---- Upload from host ----------------------------------------------------
|
||||||
|
|
||||||
|
/** Upload a host Eigen matrix to device memory (synchronous).
|
||||||
|
*
|
||||||
|
* Evaluates the expression into a contiguous ColMajor temporary, copies to
|
||||||
|
* device via cudaMemcpyAsync on \p stream, and synchronizes before returning.
|
||||||
|
*
|
||||||
|
* \param host Any Eigen matrix expression.
|
||||||
|
* \param stream CUDA stream for the transfer (default: stream 0).
|
||||||
|
*/
|
||||||
|
template <typename Derived>
|
||||||
|
static DeviceMatrix fromHost(const MatrixBase<Derived>& host, cudaStream_t stream = nullptr) {
|
||||||
|
const PlainMatrix mat(host.derived());
|
||||||
|
DeviceMatrix dm(mat.rows(), mat.cols());
|
||||||
|
if (dm.sizeInBytes() > 0) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dm.data_, mat.data(), dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
|
||||||
|
}
|
||||||
|
return dm;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Upload from a raw host pointer to device memory (asynchronous).
|
||||||
|
*
|
||||||
|
* Enqueues an async H2D copy on \p stream and records an internal event.
|
||||||
|
* The caller must keep \p host_data alive until the transfer completes
|
||||||
|
* (check via the internal event or synchronize the stream).
|
||||||
|
*
|
||||||
|
* \param host_data Pointer to contiguous column-major host data.
|
||||||
|
* \param rows Number of rows.
|
||||||
|
* \param cols Number of columns.
|
||||||
|
* \param stream CUDA stream for the transfer.
|
||||||
|
*/
|
||||||
|
static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, cudaStream_t stream) {
  eigen_assert(rows >= 0 && cols >= 0);
  eigen_assert(host_data != nullptr || (rows == 0 || cols == 0));

  DeviceMatrix dm(rows, cols);
  if (dm.sizeInBytes() == 0) return dm;  // empty: no copy, no event

  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dm.data_, host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
  // Record an event on `stream` right after the copy so waitReady() can order against it.
  dm.recordReady(stream);
  return dm;
}
|
||||||
|
|
||||||
|
// ---- Download to host ----------------------------------------------------
|
||||||
|
|
||||||
|
/** Download device matrix to host memory (synchronous).
|
||||||
|
*
|
||||||
|
* Waits on the internal ready event, enqueues a D2H copy on \p stream,
|
||||||
|
* synchronizes, and returns the host matrix directly.
|
||||||
|
*
|
||||||
|
* \param stream CUDA stream for the transfer (default: stream 0).
|
||||||
|
*/
|
||||||
|
PlainMatrix toHost(cudaStream_t stream = nullptr) const {
  PlainMatrix host_buf(rows_, cols_);
  if (sizeInBytes() == 0) return host_buf;  // empty: nothing to download

  // Order the copy after the last recorded write, then block until it is done.
  waitReady(stream);
  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(host_buf.data(), data_, sizeInBytes(), cudaMemcpyDeviceToHost, stream));
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
  return host_buf;
}
|
||||||
|
|
||||||
|
/** Enqueue an async device-to-host transfer and return a future.
|
||||||
|
*
|
||||||
|
* Waits on the internal ready event (if any) to ensure the device data is
|
||||||
|
* valid, then enqueues the D2H copy on \p stream. Returns a HostTransfer
|
||||||
|
* future; call .get() to block and retrieve the host matrix.
|
||||||
|
*
|
||||||
|
* \param stream CUDA stream for the transfer (default: stream 0).
|
||||||
|
*/
|
||||||
|
HostTransfer<Scalar> toHostAsync(cudaStream_t stream = nullptr) const {
  PlainMatrix host_buf(rows_, cols_);

  const bool has_payload = sizeInBytes() > 0;
  if (has_payload) {
    // Order the copy after the last recorded write to this matrix.
    waitReady(stream);
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(host_buf.data(), data_, sizeInBytes(), cudaMemcpyDeviceToHost, stream));
  }

  // The completion event is created and recorded even for an empty matrix, so
  // the returned HostTransfer always holds a valid event.
  cudaEvent_t transfer_event;
  EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&transfer_event, cudaEventDisableTiming));
  EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(transfer_event, stream));

  return HostTransfer<Scalar>(std::move(host_buf), transfer_event);
}
|
||||||
|
|
||||||
|
// ---- Device-to-device copy -----------------------------------------------
|
||||||
|
|
||||||
|
/** Deep copy on device. Fully async — records event on the result, no sync.
|
||||||
|
*
|
||||||
|
* \param stream CUDA stream for the D2D copy (default: stream 0).
|
||||||
|
*/
|
||||||
|
DeviceMatrix clone(cudaStream_t stream = nullptr) const {
  DeviceMatrix result(rows_, cols_);
  if (sizeInBytes() == 0) return result;  // empty: nothing to copy, no event

  // Order the copy after the last recorded write to the source, then mark
  // the copy as the point at which `result` becomes valid.
  waitReady(stream);
  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data_, data_, sizeInBytes(), cudaMemcpyDeviceToDevice, stream));
  result.recordReady(stream);
  return result;
}
|
||||||
|
|
||||||
|
// ---- Resize (destructive) ------------------------------------------------
|
||||||
|
|
||||||
|
/** Discard contents and reallocate to (rows x cols). Clears the ready event. */
|
||||||
|
void resize(Index rows, Index cols) {
|
||||||
|
if (rows == rows_ && cols == cols_) return;
|
||||||
|
if (data_) {
|
||||||
|
(void)cudaFree(data_);
|
||||||
|
data_ = nullptr;
|
||||||
|
}
|
||||||
|
if (ready_event_) {
|
||||||
|
(void)cudaEventDestroy(ready_event_);
|
||||||
|
ready_event_ = nullptr;
|
||||||
|
}
|
||||||
|
ready_stream_ = nullptr;
|
||||||
|
retained_buffer_ = internal::DeviceBuffer();
|
||||||
|
rows_ = rows;
|
||||||
|
cols_ = cols;
|
||||||
|
size_t bytes = sizeInBytes();
|
||||||
|
if (bytes > 0) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Accessors -----------------------------------------------------------
|
||||||
|
|
||||||
|
Scalar* data() { return data_; }
|
||||||
|
const Scalar* data() const { return data_; }
|
||||||
|
Index rows() const { return rows_; }
|
||||||
|
Index cols() const { return cols_; }
|
||||||
|
bool empty() const { return rows_ == 0 || cols_ == 0; }
|
||||||
|
|
||||||
|
/** Size of the device allocation in bytes. */
|
||||||
|
size_t sizeInBytes() const { return static_cast<size_t>(rows_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
|
||||||
|
|
||||||
|
// ---- Event synchronization (public for library dispatch interop) ---------
|
||||||
|
|
||||||
|
/** Record that device data is ready after work on \p stream. */
|
||||||
|
void recordReady(cudaStream_t stream) {
  // Have an event to record into before recording.
  this->ensureEvent();
  EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(ready_event_, stream));
  // Remember the producer stream; waitReady() skips the wait for this stream.
  this->ready_stream_ = stream;
}
|
||||||
|
|
||||||
|
/** Make \p stream wait until the device data is ready.
|
||||||
|
* No-op if no event recorded, or if the consumer stream is the same as the
|
||||||
|
* producer stream (CUDA guarantees in-order execution within a stream). */
|
||||||
|
void waitReady(cudaStream_t stream) const {
  // Nothing to wait for if no event was recorded, or if the consumer is the
  // stream the event was recorded on.
  if (!ready_event_ || stream == ready_stream_) return;
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamWaitEvent(stream, ready_event_, 0));
}
|
||||||
|
|
||||||
|
// ---- Expression methods (dispatch to cuBLAS/cuSOLVER) --------------------
|
||||||
|
|
||||||
|
/** Adjoint view for GEMM dispatch. Maps to cublasXgemm with ConjTrans. */
|
||||||
|
DeviceAdjointView<Scalar> adjoint() const { return DeviceAdjointView<Scalar>(*this); }
|
||||||
|
|
||||||
|
/** Transpose view for GEMM dispatch. Maps to cublasXgemm with Trans. */
|
||||||
|
DeviceTransposeView<Scalar> transpose() const { return DeviceTransposeView<Scalar>(*this); }
|
||||||
|
|
||||||
|
/** Bind this matrix to a GpuContext for expression assignment.
|
||||||
|
* Returns a DeviceAssignment proxy: `d_C.device(ctx) = d_A * d_B;` */
|
||||||
|
DeviceAssignment<Scalar> device(GpuContext& ctx) { return DeviceAssignment<Scalar>(*this, ctx); }
|
||||||
|
|
||||||
|
/** Assign from a GEMM expression using the thread-local default GpuContext.
|
||||||
|
* Defined out-of-line after GpuContext is fully declared (see DeviceDispatch.h). */
|
||||||
|
template <typename Lhs, typename Rhs>
|
||||||
|
DeviceMatrix& operator=(const GemmExpr<Lhs, Rhs>& expr);
|
||||||
|
|
||||||
|
/** Accumulate from a GEMM expression using the thread-local default GpuContext. */
|
||||||
|
template <typename Lhs, typename Rhs>
|
||||||
|
DeviceMatrix& operator+=(const GemmExpr<Lhs, Rhs>& expr);
|
||||||
|
|
||||||
|
/** Cholesky view: d_A.llt().solve(d_B) → LltSolveExpr. */
|
||||||
|
DeviceLLTView<Scalar, Lower> llt() const { return DeviceLLTView<Scalar, Lower>(*this); }
|
||||||
|
|
||||||
|
/** Cholesky view with explicit triangle: d_A.llt<Upper>().solve(d_B). */
|
||||||
|
template <int UpLo>
|
||||||
|
DeviceLLTView<Scalar, UpLo> llt() const {
|
||||||
|
return DeviceLLTView<Scalar, UpLo>(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** LU view: d_A.lu().solve(d_B) → LuSolveExpr. */
|
||||||
|
DeviceLUView<Scalar> lu() const { return DeviceLUView<Scalar>(*this); }
|
||||||
|
|
||||||
|
/** Assign from an LLT solve expression (thread-local default context). */
|
||||||
|
template <int UpLo>
|
||||||
|
DeviceMatrix& operator=(const LltSolveExpr<Scalar, UpLo>& expr);
|
||||||
|
|
||||||
|
/** Assign from an LU solve expression (thread-local default context). */
|
||||||
|
DeviceMatrix& operator=(const LuSolveExpr<Scalar>& expr);
|
||||||
|
|
||||||
|
/** Triangular view: d_A.triangularView<Lower>().solve(d_B) → TrsmExpr. */
|
||||||
|
template <int UpLo>
|
||||||
|
DeviceTriangularView<Scalar, UpLo> triangularView() const {
|
||||||
|
return DeviceTriangularView<Scalar, UpLo>(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Self-adjoint view (mutable): d_C.selfadjointView<Lower>().rankUpdate(d_A). */
|
||||||
|
template <int UpLo>
|
||||||
|
DeviceSelfAdjointView<Scalar, UpLo> selfadjointView() {
|
||||||
|
return DeviceSelfAdjointView<Scalar, UpLo>(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Self-adjoint view (const): d_A.selfadjointView<Lower>() * d_B → SymmExpr. */
|
||||||
|
template <int UpLo>
|
||||||
|
ConstDeviceSelfAdjointView<Scalar, UpLo> selfadjointView() const {
|
||||||
|
return ConstDeviceSelfAdjointView<Scalar, UpLo>(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Assign from a TRSM expression (thread-local default context). */
|
||||||
|
template <int UpLo>
|
||||||
|
DeviceMatrix& operator=(const TrsmExpr<Scalar, UpLo>& expr);
|
||||||
|
|
||||||
|
/** Assign from a SYMM expression (thread-local default context). */
|
||||||
|
template <int UpLo>
|
||||||
|
DeviceMatrix& operator=(const SymmExpr<Scalar, UpLo>& expr);
|
||||||
|
|
||||||
|
// ---- BLAS Level-1 operations ----------------------------------------------
|
||||||
|
// DeviceMatrix is always dense (lda == rows), so a vector is simply a
|
||||||
|
// DeviceMatrix with cols == 1. These BLAS-1 methods operate on the flat
|
||||||
|
// rows*cols element array, making them work for both vectors and matrices.
|
||||||
|
//
|
||||||
|
// All methods take an explicit GpuContext& for stream/handle control.
|
||||||
|
// When everything uses the same context, event waits are skipped (same-stream).
|
||||||
|
// Defined out-of-line in DeviceDispatch.h (needs GpuContext).
|
||||||
|
|
||||||
|
/** Dot product: this^H * other. Returns DeviceScalar — the result stays
|
||||||
|
* on device until read via implicit conversion to Scalar (which syncs).
|
||||||
|
* When used with `auto`, no sync occurs until the value is needed. */
|
||||||
|
DeviceScalar<Scalar> dot(GpuContext& ctx, const DeviceMatrix& other) const;
|
||||||
|
|
||||||
|
/** Squared L2 norm via dot(x, x). Returns DeviceScalar (no sync until read).
|
||||||
|
* For real types, the result stays on device. For complex types, falls back
|
||||||
|
* to host sync (DeviceScalar arithmetic is real-only). */
|
||||||
|
DeviceScalar<typename NumTraits<Scalar>::Real> squaredNorm(GpuContext& ctx) const;
|
||||||
|
|
||||||
|
/** L2 norm. Returns DeviceScalar (no host sync). */
|
||||||
|
DeviceScalar<typename NumTraits<Scalar>::Real> norm(GpuContext& ctx) const;
|
||||||
|
|
||||||
|
/** Set all elements to zero. */
|
||||||
|
void setZero(GpuContext& ctx);
|
||||||
|
|
||||||
|
/** this += alpha * x (cuBLAS axpy). Requires same total size. */
|
||||||
|
void addScaled(GpuContext& ctx, Scalar alpha, const DeviceMatrix& x);
|
||||||
|
|
||||||
|
/** this *= alpha (cuBLAS scal). */
|
||||||
|
void scale(GpuContext& ctx, Scalar alpha);
|
||||||
|
|
||||||
|
/** Deep copy: this = other (cuBLAS copy). Resizes if needed. */
|
||||||
|
void copyFrom(GpuContext& ctx, const DeviceMatrix& other);
|
||||||
|
|
||||||
|
// Convenience overloads using the thread-local default GpuContext.
|
||||||
|
DeviceScalar<Scalar> dot(const DeviceMatrix& other) const;
|
||||||
|
DeviceScalar<typename NumTraits<Scalar>::Real> squaredNorm() const;
|
||||||
|
DeviceScalar<typename NumTraits<Scalar>::Real> norm() const;
|
||||||
|
void setZero();
|
||||||
|
|
||||||
|
// ---- BLAS-1 operator overloads for CG/iterative solver compatibility ------
|
||||||
|
// These allow CG code like `x += alpha * p` to work with DeviceMatrix.
|
||||||
|
// `alpha * DeviceMatrix` already returns `DeviceScaled<DeviceMatrix<Scalar>>`
|
||||||
|
// (defined in DeviceExpr.h). These operators dispatch to cuBLAS axpy/scal.
|
||||||
|
// Defined out-of-line in DeviceDispatch.h.
|
||||||
|
|
||||||
|
/** this += alpha * x (cuBLAS axpy). For `x += alpha * p`. */
|
||||||
|
DeviceMatrix& operator+=(const DeviceScaled<DeviceMatrix>& expr);
|
||||||
|
|
||||||
|
/** this -= alpha * x (cuBLAS axpy with negated alpha). For `r -= alpha * tmp`. */
|
||||||
|
DeviceMatrix& operator-=(const DeviceScaled<DeviceMatrix>& expr);
|
||||||
|
|
||||||
|
/** this += x (cuBLAS axpy with alpha=1). */
|
||||||
|
DeviceMatrix& operator+=(const DeviceMatrix& other);
|
||||||
|
|
||||||
|
/** this -= x (cuBLAS axpy with alpha=-1). */
|
||||||
|
DeviceMatrix& operator-=(const DeviceMatrix& other);
|
||||||
|
|
||||||
|
/** this *= alpha (cuBLAS scal, host pointer mode). For `p *= beta`. */
|
||||||
|
DeviceMatrix& operator*=(Scalar alpha);
|
||||||
|
|
||||||
|
/** this *= alpha (cuBLAS scal, device pointer mode). Avoids host sync. */
|
||||||
|
DeviceMatrix& operator*=(const DeviceScalar<Scalar>& alpha);
|
||||||
|
|
||||||
|
/** Element-wise product: result[i] = this[i] * other[i] (NPP nppsMul).
|
||||||
|
* Returns a new DeviceMatrix. Defined out-of-line in DeviceDispatch.h. */
|
||||||
|
DeviceMatrix cwiseProduct(GpuContext& ctx, const DeviceMatrix& other) const;
|
||||||
|
|
||||||
|
/** In-place element-wise product: this[i] = a[i] * b[i] (NPP nppsMul).
|
||||||
|
* Reuses this matrix's buffer when sizes match, avoiding cudaMalloc. */
|
||||||
|
void cwiseProduct(GpuContext& ctx, const DeviceMatrix& a, const DeviceMatrix& b);
|
||||||
|
|
||||||
|
/** this += DeviceScalar * x (cuBLAS axpy with POINTER_MODE_DEVICE). */
|
||||||
|
DeviceMatrix& operator+=(const DeviceScaledDevice<Scalar>& expr);
|
||||||
|
|
||||||
|
/** this -= DeviceScalar * x (cuBLAS axpy with negated device scalar). */
|
||||||
|
DeviceMatrix& operator-=(const DeviceScaledDevice<Scalar>& expr);
|
||||||
|
|
||||||
|
/** Assign from an SpMV expression: d_y = d_A * d_x. */
|
||||||
|
DeviceMatrix& operator=(const SpMVExpr<Scalar>& expr);
|
||||||
|
|
||||||
|
/** Assign from an add expression: d_C = alpha * d_A + beta * d_B (cuBLAS geam). */
|
||||||
|
DeviceMatrix& operator=(const DeviceAddExpr<Scalar>& expr);
|
||||||
|
|
||||||
|
/** No-op — all DeviceMatrix operations are implicitly noalias.
|
||||||
|
*
|
||||||
|
* Unlike Eigen's Matrix, where omitting .noalias() triggers a copy to a
|
||||||
|
* temporary for safety, DeviceMatrix dispatches directly to NVIDIA library
|
||||||
|
* calls which have no built-in aliasing protection. Every assignment
|
||||||
|
* (`d_C = d_A * d_B`, `d_y = d_A * d_x`, etc.) behaves as if .noalias()
|
||||||
|
* were specified. The caller must ensure operands don't alias the
|
||||||
|
* destination for GEMM and SpMV. geam (`d_C = d_A + alpha * d_B`) is
|
||||||
|
* safe with aliasing. Debug asserts catch violations.
|
||||||
|
*
|
||||||
|
* This method exists so that `tmp.noalias() = mat * p` compiles for both
|
||||||
|
* Matrix and DeviceMatrix. */
|
||||||
|
DeviceMatrix& noalias() {
  // Intentionally does nothing beyond returning the object itself.
  DeviceMatrix& self = *this;
  return self;
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
// ---- Private: adopt a raw device pointer (used by friend solvers) --------
|
||||||
|
|
||||||
|
// Stores the given pointer and dimensions as-is; nothing is allocated or copied here.
DeviceMatrix(Scalar* device_ptr, Index rows, Index cols)
    : data_(device_ptr),
      rows_(rows),
      cols_(cols) {}
|
||||||
|
|
||||||
|
/** Transfer ownership of the device pointer out. Zeros internal state. */
|
||||||
|
Scalar* release() {
  // Remember the pointer being handed out before the members are cleared.
  Scalar* const released = data_;

  data_ = nullptr;
  rows_ = 0;
  cols_ = 0;

  // Drop the readiness event (if one was ever created) and forget its stream.
  // The destroy result is deliberately discarded, as before.
  if (ready_event_ != nullptr) {
    (void)cudaEventDestroy(ready_event_);
    ready_event_ = nullptr;
  }
  ready_stream_ = nullptr;

  return released;
}
|
||||||
|
|
||||||
|
// ---- Private helpers -------------------------------------------------------
|
||||||
|
|
||||||
|
void ensureEvent() {
|
||||||
|
if (!ready_event_) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&ready_event_, cudaEventDisableTiming));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void retainBuffer(internal::DeviceBuffer&& buffer) {
  // Move-assign into the retained slot, replacing whatever was held before.
  retained_buffer_ = std::move(buffer);
}
|
||||||
|
|
||||||
|
// ---- Friend declarations ------------------------------------------------
|
||||||
|
|
||||||
|
template <typename, int>
|
||||||
|
friend class GpuLLT;
|
||||||
|
template <typename>
|
||||||
|
friend class GpuLU;
|
||||||
|
template <typename>
|
||||||
|
friend class GpuQR;
|
||||||
|
template <typename>
|
||||||
|
friend class GpuSVD;
|
||||||
|
template <typename>
|
||||||
|
friend class GpuSelfAdjointEigenSolver;
|
||||||
|
|
||||||
|
// ---- Data members --------------------------------------------------------
|
||||||
|
|
||||||
|
Scalar* data_ = nullptr;
|
||||||
|
Index rows_ = 0;
|
||||||
|
Index cols_ = 0;
|
||||||
|
cudaEvent_t ready_event_ = nullptr; // internal: tracks last write completion
|
||||||
|
cudaStream_t ready_stream_ = nullptr; // stream that recorded ready_event_ (for same-stream skip)
|
||||||
|
internal::DeviceBuffer retained_buffer_; // internal: keeps async aux buffers alive
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_DEVICE_MATRIX_H
|
||||||
121
Eigen/src/GPU/DeviceScalar.h
Normal file
121
Eigen/src/GPU/DeviceScalar.h
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Device-resident scalar for deferred host synchronization.
|
||||||
|
//
|
||||||
|
// DeviceScalar<Scalar> wraps a single value in device memory. Reductions
|
||||||
|
// (dot, nrm2) write results directly to device memory via
|
||||||
|
// CUBLAS_POINTER_MODE_DEVICE, deferring host sync until the value is read.
|
||||||
|
//
|
||||||
|
// Implicit conversion to Scalar triggers cudaStreamSynchronize + download.
|
||||||
|
// In CG, this reduces 3 syncs/iter to effectively 1: the first conversion
|
||||||
|
// syncs the stream, subsequent conversions in the same expression just
|
||||||
|
// download (the stream is already flushed).
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// auto dot_val = d_x.dot(d_y); // DeviceScalar, no sync
|
||||||
|
// auto norm_val = d_r.squaredNorm(); // DeviceScalar, no sync
|
||||||
|
// Scalar alpha = absNew / dot_val; // sync here (both values downloaded)
|
||||||
|
// d_x += alpha * d_p; // host-scalar axpy (as before)
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_DEVICE_SCALAR_H
|
||||||
|
#define EIGEN_GPU_DEVICE_SCALAR_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSupport.h"
|
||||||
|
#include "./DeviceScalarOps.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class DeviceScalar {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
|
||||||
|
/** Allocate uninitialized device scalar. Contents are undefined until written
|
||||||
|
* (e.g., by cuBLAS dot/nrm2 with POINTER_MODE_DEVICE). Consistent with
|
||||||
|
* DeviceMatrix(rows, cols) which also does not zero-initialize. */
|
||||||
|
explicit DeviceScalar(cudaStream_t stream = nullptr) : d_val_(sizeof(Scalar)), stream_(stream) {}
|
||||||
|
|
||||||
|
DeviceScalar(Scalar host_val, cudaStream_t stream) : d_val_(sizeof(Scalar)), stream_(stream) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_val_.ptr, &host_val, sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
DeviceScalar(DeviceScalar&& o) noexcept : d_val_(std::move(o.d_val_)), stream_(o.stream_) { o.stream_ = nullptr; }
|
||||||
|
|
||||||
|
DeviceScalar& operator=(DeviceScalar&& o) noexcept {
|
||||||
|
if (this != &o) {
|
||||||
|
d_val_ = std::move(o.d_val_);
|
||||||
|
stream_ = o.stream_;
|
||||||
|
o.stream_ = nullptr;
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
DeviceScalar(const DeviceScalar&) = delete;
|
||||||
|
DeviceScalar& operator=(const DeviceScalar&) = delete;
|
||||||
|
|
||||||
|
/** Download from device. Synchronizes the stream on first call;
|
||||||
|
* subsequent calls in the same expression are cheap (stream already flushed). */
|
||||||
|
Scalar get() const {
|
||||||
|
Scalar result;
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&result, d_val_.ptr, sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Implicit conversion — allows `Scalar alpha = deviceScalar` and
|
||||||
|
* `if (deviceScalar < threshold)`. Triggers sync. */
|
||||||
|
operator Scalar() const { return get(); }
|
||||||
|
|
||||||
|
Scalar* devicePtr() { return static_cast<Scalar*>(d_val_.ptr); }
|
||||||
|
const Scalar* devicePtr() const { return static_cast<const Scalar*>(d_val_.ptr); }
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
|
||||||
|
// ---- Device-side arithmetic (no host sync) ---------------------------------
|
||||||
|
// Uses NPP from DeviceScalarOps.h. All results stay on device.
|
||||||
|
// Currently supports real types only (float, double). Complex types
|
||||||
|
// fall back to implicit conversion (host sync) for division.
|
||||||
|
//
|
||||||
|
// Note: DeviceScalar has no cross-stream readiness tracking. All
|
||||||
|
// operations must be on the same CUDA stream. This is the natural
|
||||||
|
// pattern in iterative solvers where one GpuContext owns all work.
|
||||||
|
|
||||||
|
friend DeviceScalar operator/(const DeviceScalar& a, const DeviceScalar& b) {
|
||||||
|
DeviceScalar result(a.stream_);
|
||||||
|
internal::device_scalar_div(a.devicePtr(), b.devicePtr(), result.devicePtr(), a.stream_);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
friend DeviceScalar operator/(Scalar a, const DeviceScalar& b) {
|
||||||
|
DeviceScalar d_a(a, b.stream_);
|
||||||
|
return d_a / b;
|
||||||
|
}
|
||||||
|
|
||||||
|
friend DeviceScalar operator/(const DeviceScalar& a, Scalar b) {
|
||||||
|
DeviceScalar d_b(b, a.stream_);
|
||||||
|
return a / d_b;
|
||||||
|
}
|
||||||
|
|
||||||
|
DeviceScalar operator-() const {
|
||||||
|
DeviceScalar result(stream_);
|
||||||
|
internal::device_scalar_neg(devicePtr(), result.devicePtr(), stream_);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
internal::DeviceBuffer d_val_;
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_DEVICE_SCALAR_H
|
||||||
117
Eigen/src/GPU/DeviceScalarOps.h
Normal file
117
Eigen/src/GPU/DeviceScalarOps.h
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Device-resident scalar and element-wise operations via NPP signals.
|
||||||
|
// Header-only — no custom CUDA kernels needed. Uses nppsDiv, nppsMul,
|
||||||
|
// nppsMulC from the NPP library (CUDA::npps, part of the CUDA toolkit).
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_DEVICE_SCALAR_OPS_H
|
||||||
|
#define EIGEN_GPU_DEVICE_SCALAR_OPS_H
|
||||||
|
|
||||||
|
#include <cuda_runtime.h>
#include <npps_arithmetic_and_logical_operations.h>

#include "./GpuSupport.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
// ---- NppStreamContext helper ------------------------------------------------
|
||||||
|
|
||||||
|
/** Build the NppStreamContext required by the NPP `_Ctx` entry points.
 *
 * The device-related fields describe the calling thread's *current* CUDA
 * device. They are cached per thread and re-queried whenever the current
 * device differs from the cached one, so a thread that switches devices with
 * cudaSetDevice() does not get stale attributes. The stream handle and its
 * flags are filled in on every call.
 *
 * Every CUDA runtime call is checked with EIGEN_CUDA_RUNTIME_CHECK.
 *
 * \param stream CUDA stream the subsequent NPP call will be issued on.
 * \returns a fully populated NppStreamContext for \p stream.
 */
inline NppStreamContext make_npp_stream_ctx(cudaStream_t stream) {
  // Per-thread cache of the attributes of one device.
  struct CachedDeviceInfo {
    bool initialized = false;
    int device_id = 0;
    int cc_major = 0;
    int cc_minor = 0;
    int mp_count = 0;
    int max_threads_per_mp = 0;
    int max_threads_per_block = 0;
    int shared_mem_per_block = 0;

    // Re-query the attributes unless they are already cached for `current_device`.
    void refresh(int current_device) {
      if (initialized && device_id == current_device) return;
      // Stay invalid until every query below has succeeded.
      initialized = false;
      device_id = current_device;
      EIGEN_CUDA_RUNTIME_CHECK(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device_id));
      EIGEN_CUDA_RUNTIME_CHECK(cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device_id));
      EIGEN_CUDA_RUNTIME_CHECK(cudaDeviceGetAttribute(&mp_count, cudaDevAttrMultiProcessorCount, device_id));
      EIGEN_CUDA_RUNTIME_CHECK(
          cudaDeviceGetAttribute(&max_threads_per_mp, cudaDevAttrMaxThreadsPerMultiProcessor, device_id));
      EIGEN_CUDA_RUNTIME_CHECK(
          cudaDeviceGetAttribute(&max_threads_per_block, cudaDevAttrMaxThreadsPerBlock, device_id));
      EIGEN_CUDA_RUNTIME_CHECK(
          cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
      initialized = true;
    }
  };
  thread_local CachedDeviceInfo cached;

  // The current device can change between calls, so it is looked up each time.
  int current_device = 0;
  EIGEN_CUDA_RUNTIME_CHECK(cudaGetDevice(&current_device));
  cached.refresh(current_device);

  NppStreamContext ctx = {};
  ctx.hStream = stream;
  ctx.nCudaDeviceId = cached.device_id;
  ctx.nCudaDevAttrComputeCapabilityMajor = cached.cc_major;
  ctx.nCudaDevAttrComputeCapabilityMinor = cached.cc_minor;
  ctx.nMultiProcessorCount = cached.mp_count;
  ctx.nMaxThreadsPerMultiProcessor = cached.max_threads_per_mp;
  ctx.nMaxThreadsPerBlock = cached.max_threads_per_block;
  ctx.nSharedMemPerBlock = cached.shared_mem_per_block;
  // Stream flags are a property of the stream, so they are never cached.
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamGetFlags(stream, &ctx.nStreamFlags));
  return ctx;
}
|
||||||
|
|
||||||
|
// ---- Scalar division: c = a / b (device-resident, async) --------------------
|
||||||
|
|
||||||
|
inline void device_scalar_div(const float* a, const float* b, float* c, cudaStream_t stream) {
|
||||||
|
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
|
||||||
|
nppsDiv_32f_Ctx(b, a, c, 1, npp_ctx); // NPP: pDst[i] = pSrc2[i] / pSrc1[i]
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void device_scalar_div(const double* a, const double* b, double* c, cudaStream_t stream) {
|
||||||
|
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
|
||||||
|
nppsDiv_64f_Ctx(b, a, c, 1, npp_ctx); // NPP: pDst[i] = pSrc2[i] / pSrc1[i]
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Scalar negation: c = -a (device-resident, async) -----------------------
|
||||||
|
|
||||||
|
inline void device_scalar_neg(const float* a, float* c, cudaStream_t stream) {
|
||||||
|
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
|
||||||
|
nppsMulC_32f_Ctx(a, -1.0f, c, 1, npp_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void device_scalar_neg(const double* a, double* c, cudaStream_t stream) {
|
||||||
|
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
|
||||||
|
nppsMulC_64f_Ctx(a, -1.0, c, 1, npp_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Element-wise vector multiply: c[i] = a[i] * b[i] ----------------------
|
||||||
|
|
||||||
|
inline void device_cwiseProduct(const float* a, const float* b, float* c, int n, cudaStream_t stream) {
|
||||||
|
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
|
||||||
|
nppsMul_32f_Ctx(a, b, c, static_cast<size_t>(n), npp_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void device_cwiseProduct(const double* a, const double* b, double* c, int n, cudaStream_t stream) {
|
||||||
|
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
|
||||||
|
nppsMul_64f_Ctx(a, b, c, static_cast<size_t>(n), npp_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Element-wise vector division: c[i] = a[i] / b[i] ----------------------
|
||||||
|
|
||||||
|
inline void device_cwiseQuotient(const float* a, const float* b, float* c, int n, cudaStream_t stream) {
|
||||||
|
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
|
||||||
|
nppsDiv_32f_Ctx(b, a, c, static_cast<size_t>(n), npp_ctx); // NPP: dst = src2 / src1
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void device_cwiseQuotient(const double* a, const double* b, double* c, int n, cudaStream_t stream) {
|
||||||
|
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
|
||||||
|
nppsDiv_64f_Ctx(b, a, c, static_cast<size_t>(n), npp_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_DEVICE_SCALAR_OPS_H
|
||||||
115
Eigen/src/GPU/DeviceSolverExpr.h
Normal file
115
Eigen/src/GPU/DeviceSolverExpr.h
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Solver expression types for DeviceMatrix.
|
||||||
|
//
|
||||||
|
// Each expression maps 1:1 to cuSOLVER library calls:
|
||||||
|
// LltSolveExpr → cusolverDnXpotrf + cusolverDnXpotrs
|
||||||
|
// LuSolveExpr → cusolverDnXgetrf + cusolverDnXgetrs
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// d_X = d_A.llt().solve(d_B); // Cholesky solve
|
||||||
|
// d_X.device(ctx) = d_A.lu().solve(d_B); // LU solve on explicit stream
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_DEVICE_SOLVER_EXPR_H
|
||||||
|
#define EIGEN_GPU_DEVICE_SOLVER_EXPR_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
// Forward declarations.
|
||||||
|
template <typename Scalar_>
|
||||||
|
class DeviceMatrix;
|
||||||
|
class GpuContext;
|
||||||
|
|
||||||
|
// ---- LLT solve expression ---------------------------------------------------
|
||||||
|
// d_A.llt().solve(d_B) → LltSolveExpr → cusolverDnXpotrf + cusolverDnXpotrs
|
||||||
|
|
||||||
|
template <typename Scalar_, int UpLo_ = Lower>
|
||||||
|
class LltSolveExpr {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
enum { UpLo = UpLo_ };
|
||||||
|
|
||||||
|
LltSolveExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
|
||||||
|
const DeviceMatrix<Scalar>& matrix() const { return A_; }
|
||||||
|
const DeviceMatrix<Scalar>& rhs() const { return B_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
const DeviceMatrix<Scalar>& A_;
|
||||||
|
const DeviceMatrix<Scalar>& B_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- LU solve expression ----------------------------------------------------
|
||||||
|
// d_A.lu().solve(d_B) → LuSolveExpr → cusolverDnXgetrf + cusolverDnXgetrs
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class LuSolveExpr {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
|
||||||
|
LuSolveExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
|
||||||
|
const DeviceMatrix<Scalar>& matrix() const { return A_; }
|
||||||
|
const DeviceMatrix<Scalar>& rhs() const { return B_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
const DeviceMatrix<Scalar>& A_;
|
||||||
|
const DeviceMatrix<Scalar>& B_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- DeviceLLTView: d_A.llt() → view with .solve() --------------------------
|
||||||
|
|
||||||
|
template <typename Scalar_, int UpLo_ = Lower>
|
||||||
|
class DeviceLLTView {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
|
||||||
|
explicit DeviceLLTView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
|
||||||
|
|
||||||
|
/** Build a solve expression: d_A.llt().solve(d_B).
|
||||||
|
* The expression is evaluated when assigned to a DeviceMatrix. */
|
||||||
|
LltSolveExpr<Scalar, UpLo_> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
|
||||||
|
|
||||||
|
// For cached factorizations, use the explicit GpuLLT API directly:
|
||||||
|
// GpuLLT<double> llt;
|
||||||
|
// llt.compute(d_A);
|
||||||
|
// auto d_X1 = llt.solve(d_B1);
|
||||||
|
// auto d_X2 = llt.solve(d_B2);
|
||||||
|
|
||||||
|
private:
|
||||||
|
const DeviceMatrix<Scalar>& mat_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- DeviceLUView: d_A.lu() → view with .solve() ----------------------------
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class DeviceLUView {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
|
||||||
|
explicit DeviceLUView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
|
||||||
|
|
||||||
|
/** Build a solve expression: d_A.lu().solve(d_B). */
|
||||||
|
LuSolveExpr<Scalar> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
|
||||||
|
|
||||||
|
// For cached factorizations, use the explicit GpuLU API directly:
|
||||||
|
// GpuLU<double> lu;
|
||||||
|
// lu.compute(d_A);
|
||||||
|
// auto d_X1 = lu.solve(d_B1);
|
||||||
|
// auto d_X2 = lu.solve(d_B2);
|
||||||
|
|
||||||
|
private:
|
||||||
|
const DeviceMatrix<Scalar>& mat_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_DEVICE_SOLVER_EXPR_H
|
||||||
139
Eigen/src/GPU/GpuContext.h
Normal file
139
Eigen/src/GPU/GpuContext.h
Normal file
@@ -0,0 +1,139 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Unified GPU execution context.
|
||||||
|
//
|
||||||
|
// GpuContext owns a CUDA stream and all NVIDIA library handles (cuBLAS,
|
||||||
|
// cuBLASLt, cuSOLVER, cuSPARSE; future cuDSS). It is the entry point for all GPU
|
||||||
|
// operations on DeviceMatrix.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuContext ctx; // explicit context
|
||||||
|
// d_C.device(ctx) = d_A * d_B; // GEMM on ctx's stream
|
||||||
|
//
|
||||||
|
// d_C = d_A * d_B; // thread-local default context
|
||||||
|
// GpuContext& ctx = GpuContext::threadLocal();
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_CONTEXT_H
|
||||||
|
#define EIGEN_GPU_CONTEXT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuBlasSupport.h"
|
||||||
|
#include "./CuSolverSupport.h"
|
||||||
|
#include <cusparse.h>
|
||||||
|
#include <cufft.h>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
/** \ingroup GPU_Module
|
||||||
|
* \class GpuContext
|
||||||
|
* \brief Unified GPU execution context owning a CUDA stream and library handles.
|
||||||
|
*
|
||||||
|
* Each GpuContext instance creates a dedicated CUDA stream, a cuBLAS handle,
|
||||||
|
* and a cuSOLVER handle, all bound to that stream. Multiple contexts enable
|
||||||
|
* concurrent execution on independent streams.
|
||||||
|
*
|
||||||
|
* A lazily-created thread-local default is available via threadLocal() for
|
||||||
|
* simple single-stream usage.
|
||||||
|
*/
|
||||||
|
class GpuContext {
|
||||||
|
public:
|
||||||
|
/** Create a new context with a dedicated CUDA stream. */
|
||||||
|
GpuContext() : owns_stream_(true) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||||
|
init_handles();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Create a context on an existing stream (e.g., stream 0 = nullptr).
|
||||||
|
* The caller retains ownership of the stream — this context will not destroy it. */
|
||||||
|
explicit GpuContext(cudaStream_t stream) : stream_(stream), owns_stream_(false) { init_handles(); }
|
||||||
|
|
||||||
|
~GpuContext() {
|
||||||
|
if (cusparse_) (void)cusparseDestroy(cusparse_);
|
||||||
|
if (cusolver_) (void)cusolverDnDestroy(cusolver_);
|
||||||
|
if (cublas_lt_) (void)cublasLtDestroy(cublas_lt_);
|
||||||
|
if (cublas_) (void)cublasDestroy(cublas_);
|
||||||
|
if (owns_stream_ && stream_) (void)cudaStreamDestroy(stream_);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Non-copyable, non-movable (owns library handles).
|
||||||
|
GpuContext(const GpuContext&) = delete;
|
||||||
|
GpuContext& operator=(const GpuContext&) = delete;
|
||||||
|
|
||||||
|
/** Get the thread-local default context.
|
||||||
|
* If setThreadLocal() has been called, returns that context.
|
||||||
|
* Otherwise lazily creates a new context with a dedicated stream. */
|
||||||
|
static GpuContext& threadLocal() {
|
||||||
|
GpuContext* override = tl_override_ptr();
|
||||||
|
if (override) return *override;
|
||||||
|
thread_local GpuContext ctx;
|
||||||
|
return ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Override the thread-local default context for this thread.
|
||||||
|
* The caller retains ownership of \p ctx — it must outlive all uses.
|
||||||
|
* Pass nullptr to restore the lazily-created default. */
|
||||||
|
static void setThreadLocal(GpuContext* ctx) { tl_override_ptr() = ctx; }
|
||||||
|
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
cublasHandle_t cublasHandle() const { return cublas_; }
|
||||||
|
cusolverDnHandle_t cusolverHandle() const { return cusolver_; }
|
||||||
|
|
||||||
|
/** cuBLASLt handle (lazy-initialized on first GEMM call). */
|
||||||
|
cublasLtHandle_t cublasLtHandle() const {
|
||||||
|
if (!cublas_lt_) {
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasLtCreate(&cublas_lt_));
|
||||||
|
}
|
||||||
|
return cublas_lt_;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Workspace buffer for cublasLtMatmul (grown lazily by cublaslt_gemm).
|
||||||
|
* Not thread-safe — all GEMM calls must be on this context's stream. */
|
||||||
|
internal::DeviceBuffer* gemmWorkspace() const { return &gemm_workspace_; }
|
||||||
|
|
||||||
|
/** cuSPARSE handle (lazy-initialized on first call). */
|
||||||
|
cusparseHandle_t cusparseHandle() const {
|
||||||
|
if (!cusparse_) {
|
||||||
|
cusparseStatus_t s1 = cusparseCreate(&cusparse_);
|
||||||
|
eigen_assert(s1 == CUSPARSE_STATUS_SUCCESS && "cusparseCreate failed");
|
||||||
|
EIGEN_UNUSED_VARIABLE(s1);
|
||||||
|
cusparseStatus_t s2 = cusparseSetStream(cusparse_, stream_);
|
||||||
|
eigen_assert(s2 == CUSPARSE_STATUS_SUCCESS && "cusparseSetStream failed");
|
||||||
|
EIGEN_UNUSED_VARIABLE(s2);
|
||||||
|
}
|
||||||
|
return cusparse_;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cublasHandle_t cublas_ = nullptr;
|
||||||
|
cusolverDnHandle_t cusolver_ = nullptr;
|
||||||
|
mutable cublasLtHandle_t cublas_lt_ = nullptr; // lazy
|
||||||
|
mutable cusparseHandle_t cusparse_ = nullptr; // lazy
|
||||||
|
mutable internal::DeviceBuffer gemm_workspace_; // lazy
|
||||||
|
bool owns_stream_ = true;
|
||||||
|
|
||||||
|
static GpuContext*& tl_override_ptr() {
|
||||||
|
thread_local GpuContext* ptr = nullptr;
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
void init_handles() {
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_CONTEXT_H
|
||||||
232
Eigen/src/GPU/GpuEigenSolver.h
Normal file
232
Eigen/src/GPU/GpuEigenSolver.h
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU self-adjoint eigenvalue decomposition using cuSOLVER.
|
||||||
|
//
|
||||||
|
// Wraps cusolverDnXsyevd (symmetric/Hermitian divide-and-conquer).
|
||||||
|
// Stores eigenvalues and eigenvectors on device.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuSelfAdjointEigenSolver<double> es(A);
|
||||||
|
// VectorXd eigenvals = es.eigenvalues();
|
||||||
|
// MatrixXd eigenvecs = es.eigenvectors();
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_EIGENSOLVER_H
|
||||||
|
#define EIGEN_GPU_EIGENSOLVER_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuSolverSupport.h"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class GpuSelfAdjointEigenSolver {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||||
|
using RealVector = Matrix<RealScalar, Dynamic, 1>;
|
||||||
|
|
||||||
|
/** Eigenvalue-only or eigenvalues + eigenvectors. */
|
||||||
|
enum ComputeMode { EigenvaluesOnly, ComputeEigenvectors };
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver() { init_context(); }
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
explicit GpuSelfAdjointEigenSolver(const EigenBase<InputType>& A, ComputeMode mode = ComputeEigenvectors) {
|
||||||
|
init_context();
|
||||||
|
compute(A, mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
~GpuSelfAdjointEigenSolver() {
|
||||||
|
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||||
|
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver(const GpuSelfAdjointEigenSolver&) = delete;
|
||||||
|
GpuSelfAdjointEigenSolver& operator=(const GpuSelfAdjointEigenSolver&) = delete;
|
||||||
|
|
||||||
|
// ---- Factorization -------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
GpuSelfAdjointEigenSolver& compute(const EigenBase<InputType>& A, ComputeMode mode = ComputeEigenvectors) {
|
||||||
|
eigen_assert(A.rows() == A.cols() && "GpuSelfAdjointEigenSolver requires a square matrix");
|
||||||
|
mode_ = mode;
|
||||||
|
n_ = A.rows();
|
||||||
|
info_ = InvalidInput;
|
||||||
|
info_synced_ = false;
|
||||||
|
|
||||||
|
if (n_ == 0) {
|
||||||
|
info_ = Success;
|
||||||
|
info_synced_ = true;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
const PlainMatrix mat(A.derived());
|
||||||
|
lda_ = static_cast<int64_t>(n_);
|
||||||
|
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||||
|
|
||||||
|
// syevd overwrites A with eigenvectors (if requested).
|
||||||
|
d_A_ = internal::DeviceBuffer(mat_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver& compute(const DeviceMatrix<Scalar>& d_A, ComputeMode mode = ComputeEigenvectors) {
|
||||||
|
eigen_assert(d_A.rows() == d_A.cols() && "GpuSelfAdjointEigenSolver requires a square matrix");
|
||||||
|
mode_ = mode;
|
||||||
|
n_ = d_A.rows();
|
||||||
|
info_ = InvalidInput;
|
||||||
|
info_synced_ = false;
|
||||||
|
|
||||||
|
if (n_ == 0) {
|
||||||
|
info_ = Success;
|
||||||
|
info_synced_ = true;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
d_A.waitReady(stream_);
|
||||||
|
lda_ = static_cast<int64_t>(n_);
|
||||||
|
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||||
|
|
||||||
|
d_A_ = internal::DeviceBuffer(mat_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
|
||||||
|
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Accessors -----------------------------------------------------------
|
||||||
|
|
||||||
|
ComputationInfo info() const {
|
||||||
|
sync_info();
|
||||||
|
return info_;
|
||||||
|
}
|
||||||
|
|
||||||
|
Index cols() const { return n_; }
|
||||||
|
Index rows() const { return n_; }
|
||||||
|
|
||||||
|
// TODO: Add device-side accessors (deviceEigenvalues(), deviceEigenvectors())
|
||||||
|
// returning DeviceMatrix views of the internal buffers, so users can chain
|
||||||
|
// GPU operations without round-tripping through host memory.
|
||||||
|
|
||||||
|
/** Eigenvalues in ascending order. Downloads from device. */
|
||||||
|
RealVector eigenvalues() const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success);
|
||||||
|
RealVector W(n_);
|
||||||
|
if (n_ > 0) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpy(W.data(), d_W_.ptr, static_cast<size_t>(n_) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
|
||||||
|
}
|
||||||
|
return W;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Eigenvectors (columns). Downloads from device.
|
||||||
|
* Requires ComputeEigenvectors mode. */
|
||||||
|
PlainMatrix eigenvectors() const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success);
|
||||||
|
eigen_assert(mode_ == ComputeEigenvectors && "eigenvectors() requires ComputeEigenvectors mode");
|
||||||
|
PlainMatrix V(n_, n_);
|
||||||
|
if (n_ > 0) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(V.data(), d_A_.ptr,
|
||||||
|
static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost));
|
||||||
|
}
|
||||||
|
return V;
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cusolverDnHandle_t handle_ = nullptr;
|
||||||
|
internal::CusolverParams params_;
|
||||||
|
internal::DeviceBuffer d_A_; // overwritten with eigenvectors by syevd
|
||||||
|
internal::DeviceBuffer d_W_; // eigenvalues (RealScalar, length n)
|
||||||
|
internal::DeviceBuffer d_scratch_; // workspace + info
|
||||||
|
size_t scratch_size_ = 0;
|
||||||
|
std::vector<char> h_workspace_;
|
||||||
|
ComputeMode mode_ = ComputeEigenvectors;
|
||||||
|
Index n_ = 0;
|
||||||
|
int64_t lda_ = 0;
|
||||||
|
ComputationInfo info_ = InvalidInput;
|
||||||
|
int info_word_ = 0;
|
||||||
|
bool info_synced_ = true;
|
||||||
|
|
||||||
|
void init_context() {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
|
||||||
|
ensure_scratch(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ensure_scratch(size_t workspace_bytes) {
|
||||||
|
constexpr size_t kAlign = 16;
|
||||||
|
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
|
||||||
|
size_t needed = workspace_bytes + sizeof(int);
|
||||||
|
if (needed > scratch_size_) {
|
||||||
|
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
d_scratch_ = internal::DeviceBuffer(needed);
|
||||||
|
scratch_size_ = needed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void* scratch_workspace() const { return d_scratch_.ptr; }
|
||||||
|
int* scratch_info() const {
|
||||||
|
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
|
||||||
|
}
|
||||||
|
|
||||||
|
void sync_info() const {
|
||||||
|
if (!info_synced_) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
const_cast<GpuSelfAdjointEigenSolver*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
|
||||||
|
const_cast<GpuSelfAdjointEigenSolver*>(this)->info_synced_ = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void factorize() {
|
||||||
|
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||||
|
constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
|
||||||
|
|
||||||
|
info_synced_ = false;
|
||||||
|
info_ = InvalidInput;
|
||||||
|
|
||||||
|
d_W_ = internal::DeviceBuffer(static_cast<size_t>(n_) * sizeof(RealScalar));
|
||||||
|
|
||||||
|
const cusolverEigMode_t jobz =
|
||||||
|
(mode_ == ComputeEigenvectors) ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
|
||||||
|
|
||||||
|
// Use lower triangle (standard convention).
|
||||||
|
constexpr cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
|
||||||
|
|
||||||
|
size_t dev_ws = 0, host_ws = 0;
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd_bufferSize(handle_, params_.p, jobz, uplo, static_cast<int64_t>(n_), dtype,
|
||||||
|
d_A_.ptr, lda_, rtype, d_W_.ptr, dtype, &dev_ws, &host_ws));
|
||||||
|
|
||||||
|
ensure_scratch(dev_ws);
|
||||||
|
h_workspace_.resize(host_ws);
|
||||||
|
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd(handle_, params_.p, jobz, uplo, static_cast<int64_t>(n_), dtype, d_A_.ptr,
|
||||||
|
lda_, rtype, d_W_.ptr, dtype, scratch_workspace(), dev_ws,
|
||||||
|
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_EIGENSOLVER_H
|
||||||
308
Eigen/src/GPU/GpuFFT.h
Normal file
308
Eigen/src/GPU/GpuFFT.h
Normal file
@@ -0,0 +1,308 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU FFT via cuFFT.
|
||||||
|
//
|
||||||
|
// Standalone GPU FFT class with plan caching. Supports 1D and 2D transforms:
|
||||||
|
// C2C (complex-to-complex), R2C (real-to-complex), C2R (complex-to-real).
|
||||||
|
//
|
||||||
|
// Inverse transforms are scaled by 1/n (1D) or 1/(n*m) (2D) so that
|
||||||
|
// inv(fwd(x)) == x, matching Eigen's FFT convention.
|
||||||
|
//
|
||||||
|
// cuFFT plans are cached by (size, type) and reused across calls.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuFFT<float> fft;
|
||||||
|
// VectorXcf X = fft.fwd(x); // 1D C2C or R2C
|
||||||
|
// VectorXcf y = fft.inv(X); // 1D C2C inverse
|
||||||
|
// VectorXf r = fft.invReal(X, n); // 1D C2R inverse
|
||||||
|
// MatrixXcf B = fft.fwd2d(A); // 2D C2C forward
|
||||||
|
// MatrixXcf C = fft.inv2d(B); // 2D C2C inverse
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_FFT_H
|
||||||
|
#define EIGEN_GPU_FFT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuFftSupport.h"
|
||||||
|
#include "./CuBlasSupport.h"
|
||||||
|
#include <map>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class GpuFFT {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using ComplexVector = Matrix<Complex, Dynamic, 1>;
|
||||||
|
using RealVector = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using ComplexMatrix = Matrix<Complex, Dynamic, Dynamic, ColMajor>;
|
||||||
|
|
||||||
|
GpuFFT() {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
~GpuFFT() {
|
||||||
|
for (auto& kv : plans_) (void)cufftDestroy(kv.second);
|
||||||
|
if (cublas_) (void)cublasDestroy(cublas_);
|
||||||
|
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuFFT(const GpuFFT&) = delete;
|
||||||
|
GpuFFT& operator=(const GpuFFT&) = delete;
|
||||||
|
|
||||||
|
// ---- 1D Complex-to-Complex ------------------------------------------------
|
||||||
|
|
||||||
|
/** Forward 1D C2C FFT. */
|
||||||
|
template <typename Derived>
|
||||||
|
ComplexVector fwd(const MatrixBase<Derived>& x,
|
||||||
|
typename std::enable_if<NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
|
||||||
|
const ComplexVector input(x.derived());
|
||||||
|
const int n = static_cast<int>(input.size());
|
||||||
|
if (n == 0) return ComplexVector(0);
|
||||||
|
|
||||||
|
ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
|
||||||
|
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
|
||||||
|
static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));
|
||||||
|
|
||||||
|
ComplexVector result(n);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Inverse 1D C2C FFT. Scaled by 1/n. */
|
||||||
|
template <typename Derived>
|
||||||
|
ComplexVector inv(const MatrixBase<Derived>& X) {
|
||||||
|
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv() requires complex input");
|
||||||
|
const ComplexVector input(X.derived());
|
||||||
|
const int n = static_cast<int>(input.size());
|
||||||
|
if (n == 0) return ComplexVector(0);
|
||||||
|
|
||||||
|
ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
|
||||||
|
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
|
||||||
|
static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));
|
||||||
|
|
||||||
|
// Scale by 1/n.
|
||||||
|
scale_device(static_cast<Complex*>(d_out_.ptr), n, Scalar(1) / Scalar(n));
|
||||||
|
|
||||||
|
ComplexVector result(n);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 1D Real-to-Complex ---------------------------------------------------
|
||||||
|
|
||||||
|
/** Forward 1D R2C FFT. Returns n/2+1 complex values (half-spectrum). */
|
||||||
|
template <typename Derived>
|
||||||
|
ComplexVector fwd(const MatrixBase<Derived>& x,
|
||||||
|
typename std::enable_if<!NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
|
||||||
|
const RealVector input(x.derived());
|
||||||
|
const int n = static_cast<int>(input.size());
|
||||||
|
if (n == 0) return ComplexVector(0);
|
||||||
|
|
||||||
|
const int n_complex = n / 2 + 1;
|
||||||
|
ensure_buffers(n * sizeof(Scalar), n_complex * sizeof(Complex));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_in_.ptr, input.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
cufftHandle plan = get_plan_1d(n, internal::cufft_r2c_type<Scalar>::value);
|
||||||
|
EIGEN_CUFFT_CHECK(
|
||||||
|
internal::cufftExecR2C_dispatch(plan, static_cast<Scalar*>(d_in_.ptr), static_cast<Complex*>(d_out_.ptr)));
|
||||||
|
|
||||||
|
ComplexVector result(n_complex);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(result.data(), d_out_.ptr, n_complex * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 1D Complex-to-Real ---------------------------------------------------
|
||||||
|
|
||||||
|
/** Inverse 1D C2R FFT. Input is n/2+1 complex values, output is nfft real values.
|
||||||
|
* Scaled by 1/nfft. Caller must specify nfft (original real signal length). */
|
||||||
|
template <typename Derived>
|
||||||
|
RealVector invReal(const MatrixBase<Derived>& X, Index nfft) {
|
||||||
|
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "invReal() requires complex input");
|
||||||
|
const ComplexVector input(X.derived());
|
||||||
|
const int n = static_cast<int>(nfft);
|
||||||
|
const int n_complex = n / 2 + 1;
|
||||||
|
eigen_assert(input.size() == n_complex);
|
||||||
|
if (n == 0) return RealVector(0);
|
||||||
|
|
||||||
|
ensure_buffers(n_complex * sizeof(Complex), n * sizeof(Scalar));
|
||||||
|
// cuFFT C2R may overwrite the input, so we copy to d_in_.
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_in_.ptr, input.data(), n_complex * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
cufftHandle plan = get_plan_1d(n, internal::cufft_c2r_type<Scalar>::value);
|
||||||
|
EIGEN_CUFFT_CHECK(
|
||||||
|
internal::cufftExecC2R_dispatch(plan, static_cast<Complex*>(d_in_.ptr), static_cast<Scalar*>(d_out_.ptr)));
|
||||||
|
|
||||||
|
// Scale by 1/n.
|
||||||
|
scale_device_real(static_cast<Scalar*>(d_out_.ptr), n, Scalar(1) / Scalar(n));
|
||||||
|
|
||||||
|
RealVector result(n);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(result.data(), d_out_.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 2D Complex-to-Complex ------------------------------------------------
|
||||||
|
|
||||||
|
/** Forward 2D C2C FFT. Input and output are rows x cols complex matrices. */
|
||||||
|
template <typename Derived>
|
||||||
|
ComplexMatrix fwd2d(const MatrixBase<Derived>& A) {
|
||||||
|
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "fwd2d() requires complex input");
|
||||||
|
const ComplexMatrix input(A.derived());
|
||||||
|
const int rows = static_cast<int>(input.rows());
|
||||||
|
const int cols = static_cast<int>(input.cols());
|
||||||
|
if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
|
||||||
|
|
||||||
|
const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
|
||||||
|
ensure_buffers(total, total);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
|
||||||
|
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
|
||||||
|
static_cast<Complex*>(d_out_.ptr), CUFFT_FORWARD));
|
||||||
|
|
||||||
|
ComplexMatrix result(rows, cols);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Inverse 2D C2C FFT. Scaled by 1/(rows*cols). */
|
||||||
|
template <typename Derived>
|
||||||
|
ComplexMatrix inv2d(const MatrixBase<Derived>& A) {
|
||||||
|
static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv2d() requires complex input");
|
||||||
|
const ComplexMatrix input(A.derived());
|
||||||
|
const int rows = static_cast<int>(input.rows());
|
||||||
|
const int cols = static_cast<int>(input.cols());
|
||||||
|
if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
|
||||||
|
|
||||||
|
const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
|
||||||
|
ensure_buffers(total, total);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.ptr, input.data(), total, cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
|
||||||
|
EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.ptr),
|
||||||
|
static_cast<Complex*>(d_out_.ptr), CUFFT_INVERSE));
|
||||||
|
|
||||||
|
// Scale by 1/(rows*cols).
|
||||||
|
const int total_elems = rows * cols;
|
||||||
|
scale_device(static_cast<Complex*>(d_out_.ptr), total_elems, Scalar(1) / Scalar(total_elems));
|
||||||
|
|
||||||
|
ComplexMatrix result(rows, cols);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.ptr, total, cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Accessors ------------------------------------------------------------
|
||||||
|
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cublasHandle_t cublas_ = nullptr;
|
||||||
|
std::map<int64_t, cufftHandle> plans_;
|
||||||
|
internal::DeviceBuffer d_in_;
|
||||||
|
internal::DeviceBuffer d_out_;
|
||||||
|
size_t d_in_size_ = 0;
|
||||||
|
size_t d_out_size_ = 0;
|
||||||
|
|
||||||
|
void ensure_buffers(size_t in_bytes, size_t out_bytes) {
|
||||||
|
if (in_bytes > d_in_size_) {
|
||||||
|
if (d_in_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
d_in_ = internal::DeviceBuffer(in_bytes);
|
||||||
|
d_in_size_ = in_bytes;
|
||||||
|
}
|
||||||
|
if (out_bytes > d_out_size_) {
|
||||||
|
if (d_out_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
d_out_ = internal::DeviceBuffer(out_bytes);
|
||||||
|
d_out_size_ = out_bytes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Plan key encoding: rank (1 bit) | type (4 bits) | dims
|
||||||
|
static int64_t plan_key_1d(int n, cufftType type) { return (int64_t(n) << 5) | (int64_t(type) << 1) | 0; }
|
||||||
|
|
||||||
|
static int64_t plan_key_2d(int rows, int cols, cufftType type) {
|
||||||
|
return (int64_t(rows) << 35) | (int64_t(cols) << 5) | (int64_t(type) << 1) | 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
cufftHandle get_plan_1d(int n, cufftType type) {
|
||||||
|
int64_t key = plan_key_1d(n, type);
|
||||||
|
auto it = plans_.find(key);
|
||||||
|
if (it != plans_.end()) return it->second;
|
||||||
|
|
||||||
|
cufftHandle plan;
|
||||||
|
EIGEN_CUFFT_CHECK(cufftPlan1d(&plan, n, type, /*batch=*/1));
|
||||||
|
EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
|
||||||
|
plans_[key] = plan;
|
||||||
|
return plan;
|
||||||
|
}
|
||||||
|
|
||||||
|
cufftHandle get_plan_2d(int rows, int cols, cufftType type) {
|
||||||
|
int64_t key = plan_key_2d(rows, cols, type);
|
||||||
|
auto it = plans_.find(key);
|
||||||
|
if (it != plans_.end()) return it->second;
|
||||||
|
|
||||||
|
// cuFFT uses row-major (C order) for 2D: first dim = rows, second = cols.
|
||||||
|
// Eigen matrices are column-major, so we pass (cols, rows) to cuFFT
|
||||||
|
// to get the correct 2D transform.
|
||||||
|
cufftHandle plan;
|
||||||
|
EIGEN_CUFFT_CHECK(cufftPlan2d(&plan, cols, rows, type));
|
||||||
|
EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
|
||||||
|
plans_[key] = plan;
|
||||||
|
return plan;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scale complex array on device using cuBLAS scal.
|
||||||
|
void scale_device(Complex* d_ptr, int n, Scalar alpha) { scale_complex(cublas_, d_ptr, n, alpha); }
|
||||||
|
|
||||||
|
// Scale real array on device using cuBLAS scal.
|
||||||
|
void scale_device_real(Scalar* d_ptr, int n, Scalar alpha) { scale_real(cublas_, d_ptr, n, alpha); }
|
||||||
|
|
||||||
|
// Type-dispatched cuBLAS scal wrappers (C++14 compatible).
|
||||||
|
static void scale_complex(cublasHandle_t h, std::complex<float>* p, int n, float a) {
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasCsscal(h, n, &a, reinterpret_cast<cuComplex*>(p), 1));
|
||||||
|
}
|
||||||
|
static void scale_complex(cublasHandle_t h, std::complex<double>* p, int n, double a) {
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasZdscal(h, n, &a, reinterpret_cast<cuDoubleComplex*>(p), 1));
|
||||||
|
}
|
||||||
|
static void scale_real(cublasHandle_t h, float* p, int n, float a) {
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasSscal(h, n, &a, p, 1));
|
||||||
|
}
|
||||||
|
static void scale_real(cublasHandle_t h, double* p, int n, double a) {
|
||||||
|
EIGEN_CUBLAS_CHECK(cublasDscal(h, n, &a, p, 1));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_FFT_H
|
||||||
385
Eigen/src/GPU/GpuLLT.h
Normal file
385
Eigen/src/GPU/GpuLLT.h
Normal file
@@ -0,0 +1,385 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Eigen Authors
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU Cholesky (LLT) decomposition using cuSOLVER.
|
||||||
|
//
|
||||||
|
// Unlike Eigen's CPU LLT<MatrixType>, GpuLLT keeps the factored Cholesky
|
||||||
|
// factor in device memory for the lifetime of the object. Multiple solves
|
||||||
|
// against the same factor therefore only transfer the RHS and solution
|
||||||
|
// vectors, not the factor itself.
|
||||||
|
//
|
||||||
|
// Requires CUDA 11.4+ (cusolverDnXpotrf / cusolverDnXpotrs generic API + cudaMallocAsync).
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuLLT<double> llt(A); // upload A, potrf, L stays on device
|
||||||
|
// if (llt.info() != Success) { ... }
|
||||||
|
// MatrixXd x1 = llt.solve(b1); // potrs, only b1 transferred
|
||||||
|
// MatrixXd x2 = llt.solve(b2); // L already on device
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_LLT_H
|
||||||
|
#define EIGEN_GPU_LLT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuSolverSupport.h"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
/** \ingroup GPU_Module
|
||||||
|
* \class GpuLLT
|
||||||
|
* \brief GPU Cholesky (LL^T) decomposition via cuSOLVER
|
||||||
|
*
|
||||||
|
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
|
||||||
|
* \tparam UpLo_ Triangle used: Lower (default) or Upper
|
||||||
|
*
|
||||||
|
* Factorizes a symmetric positive-definite matrix A = LL^H on the GPU and
|
||||||
|
* caches the factor L in device memory. Each subsequent solve(B) uploads only
|
||||||
|
* B, calls cusolverDnXpotrs, and downloads the result — the factor is not
|
||||||
|
* re-transferred.
|
||||||
|
*
|
||||||
|
* Each GpuLLT object owns a dedicated CUDA stream and cuSOLVER handle,
|
||||||
|
* enabling concurrent factorizations from multiple objects on the same host
|
||||||
|
* thread.
|
||||||
|
*/
|
||||||
|
template <typename Scalar_, int UpLo_ = Lower>
|
||||||
|
class GpuLLT {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||||
|
|
||||||
|
enum { UpLo = UpLo_ };
|
||||||
|
|
||||||
|
// ---- Construction / destruction ------------------------------------------
|
||||||
|
|
||||||
|
/** Default constructor. Does not factorize; call compute() before solve(). */
|
||||||
|
GpuLLT() { init_context(); }
|
||||||
|
|
||||||
|
/** Factor A immediately. Equivalent to GpuLLT llt; llt.compute(A). */
|
||||||
|
template <typename InputType>
|
||||||
|
explicit GpuLLT(const EigenBase<InputType>& A) {
|
||||||
|
init_context();
|
||||||
|
compute(A);
|
||||||
|
}
|
||||||
|
|
||||||
|
~GpuLLT() {
  // A destructor cannot report failures, so both return codes are deliberately
  // dropped. Null checks skip objects whose resources were moved away.
  if (handle_ != nullptr) {
    (void)cusolverDnDestroy(handle_);
  }
  if (stream_ != nullptr) {
    (void)cudaStreamDestroy(stream_);
  }
}
|
||||||
|
|
||||||
|
// Non-copyable (owns device memory and library handles).
|
||||||
|
GpuLLT(const GpuLLT&) = delete;
|
||||||
|
GpuLLT& operator=(const GpuLLT&) = delete;
|
||||||
|
|
||||||
|
// Movable.
|
||||||
|
GpuLLT(GpuLLT&& o) noexcept
    // Take over every member of o: raw handles and plain values are copied,
    // owning members are moved.
    : stream_(o.stream_),
      handle_(o.handle_),
      params_(std::move(o.params_)),
      d_factor_(std::move(o.d_factor_)),
      factor_alloc_size_(o.factor_alloc_size_),
      d_scratch_(std::move(o.d_scratch_)),
      scratch_size_(o.scratch_size_),
      h_workspace_(std::move(o.h_workspace_)),
      n_(o.n_),
      lda_(o.lda_),
      info_(o.info_),
      info_word_(o.info_word_),
      info_synced_(o.info_synced_) {
  // Null the raw handles so o's destructor does not destroy what *this now owns.
  o.stream_ = nullptr;
  o.handle_ = nullptr;

  // Reset o's bookkeeping to an empty, unfactored state.
  o.factor_alloc_size_ = 0;
  o.scratch_size_ = 0;
  o.n_ = 0;
  o.info_ = InvalidInput;
  o.info_word_ = 0;
  o.info_synced_ = true;
}
|
||||||
|
|
||||||
|
GpuLLT& operator=(GpuLLT&& o) noexcept {
  // Self-move: nothing to transfer.
  if (this == &o) return *this;

  // Destroy the handle and stream currently owned by *this. Errors cannot be
  // reported from a noexcept function, so they are ignored.
  if (handle_) (void)cusolverDnDestroy(handle_);
  if (stream_) (void)cudaStreamDestroy(stream_);

  // Take over o's handles, buffers and bookkeeping.
  stream_ = o.stream_;
  handle_ = o.handle_;
  params_ = std::move(o.params_);
  d_factor_ = std::move(o.d_factor_);
  factor_alloc_size_ = o.factor_alloc_size_;
  d_scratch_ = std::move(o.d_scratch_);
  scratch_size_ = o.scratch_size_;
  h_workspace_ = std::move(o.h_workspace_);
  n_ = o.n_;
  lda_ = o.lda_;
  info_ = o.info_;
  info_word_ = o.info_word_;
  info_synced_ = o.info_synced_;

  // Leave o empty: null handles (so its destructor skips them) and an
  // unfactored state.
  o.stream_ = nullptr;
  o.handle_ = nullptr;
  o.factor_alloc_size_ = 0;
  o.scratch_size_ = 0;
  o.n_ = 0;
  o.info_ = InvalidInput;
  o.info_word_ = 0;
  o.info_synced_ = true;

  return *this;
}
|
||||||
|
|
||||||
|
// ---- Factorization -------------------------------------------------------
|
||||||
|
|
||||||
|
/** Compute the Cholesky factorization of A (host matrix).
|
||||||
|
*
|
||||||
|
* Uploads A to device memory, calls cusolverDnXpotrf, and retains the
|
||||||
|
* factored matrix on device. Any previous factorization is overwritten.
|
||||||
|
*/
|
||||||
|
template <typename InputType>
|
||||||
|
GpuLLT& compute(const EigenBase<InputType>& A) {
|
||||||
|
eigen_assert(A.rows() == A.cols());
|
||||||
|
if (!begin_compute(A.rows())) return *this;
|
||||||
|
|
||||||
|
// Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions).
|
||||||
|
const PlainMatrix mat(A.derived());
|
||||||
|
lda_ = static_cast<int64_t>(mat.rows());
|
||||||
|
allocate_factor_storage();
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_factor_.ptr, mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Compute the Cholesky factorization from a device-resident matrix (D2D copy). */
|
||||||
|
GpuLLT& compute(const DeviceMatrix<Scalar>& d_A) {
|
||||||
|
eigen_assert(d_A.rows() == d_A.cols());
|
||||||
|
if (!begin_compute(d_A.rows())) return *this;
|
||||||
|
|
||||||
|
lda_ = static_cast<int64_t>(d_A.rows());
|
||||||
|
d_A.waitReady(stream_);
|
||||||
|
allocate_factor_storage();
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_factor_.ptr, d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, stream_));
|
||||||
|
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Compute the Cholesky factorization from a device matrix (move, no copy). */
|
||||||
|
GpuLLT& compute(DeviceMatrix<Scalar>&& d_A) {
|
||||||
|
eigen_assert(d_A.rows() == d_A.cols());
|
||||||
|
if (!begin_compute(d_A.rows())) return *this;
|
||||||
|
|
||||||
|
lda_ = static_cast<int64_t>(d_A.rows());
|
||||||
|
d_A.waitReady(stream_);
|
||||||
|
d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
|
||||||
|
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve ---------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Solve A * X = B using the cached Cholesky factor (host → host).
|
||||||
|
*
|
||||||
|
* Uploads B to device memory, calls cusolverDnXpotrs using the factor
|
||||||
|
* retained from compute(), and returns the solution X on the host.
|
||||||
|
* The factor is not re-transferred; only B goes up and X comes down.
|
||||||
|
*
|
||||||
|
* \pre compute() must have been called and info() == Success.
|
||||||
|
* \returns X such that A * X ≈ B
|
||||||
|
*/
|
||||||
|
template <typename Rhs>
|
||||||
|
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
|
||||||
|
const_cast<GpuLLT*>(this)->sync_info();
|
||||||
|
eigen_assert(info_ == Success && "GpuLLT::solve called on a failed or uninitialized factorization");
|
||||||
|
eigen_assert(B.rows() == n_);
|
||||||
|
|
||||||
|
const PlainMatrix rhs(B);
|
||||||
|
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
||||||
|
const int64_t ldb = static_cast<int64_t>(rhs.rows());
|
||||||
|
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_x_ptr, rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
|
||||||
|
});
|
||||||
|
|
||||||
|
PlainMatrix X(n_, B.cols());
|
||||||
|
int solve_info = 0;
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
|
||||||
|
eigen_assert(solve_info == 0 && "cusolverDnXpotrs reported an error");
|
||||||
|
return X;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Solve A * X = B with device-resident RHS. Fully async.
|
||||||
|
*
|
||||||
|
* All work is enqueued on this solver's stream. Returns a DeviceMatrix
|
||||||
|
* with a recorded ready event — no host synchronization occurs.
|
||||||
|
* The caller should check info() after compute() to verify the
|
||||||
|
* factorization succeeded; this method does not check.
|
||||||
|
*/
|
||||||
|
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
|
||||||
|
eigen_assert(d_B.rows() == n_);
|
||||||
|
d_B.waitReady(stream_);
|
||||||
|
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
|
||||||
|
const int64_t ldb = static_cast<int64_t>(d_B.rows());
|
||||||
|
return solve_impl(nrhs, ldb, [&](Scalar* d_x_ptr) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_x_ptr, d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Accessors -----------------------------------------------------------
|
||||||
|
|
||||||
|
/** Returns Success if the last compute() succeeded, NumericalIssue otherwise.
|
||||||
|
* Lazily synchronizes the stream on first call after compute(). */
|
||||||
|
ComputationInfo info() const {
|
||||||
|
const_cast<GpuLLT*>(this)->sync_info();
|
||||||
|
return info_;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Number of rows of the (square) matrix passed to the last compute(). */
Index rows() const {
  return n_;
}

/** Number of columns; identical to rows() because the matrix is square. */
Index cols() const {
  return n_;
}
|
||||||
|
|
||||||
|
/** Returns the CUDA stream owned by this object.
|
||||||
|
* Advanced users may submit additional GPU work on this stream
|
||||||
|
* to overlap with or chain after GpuLLT operations. */
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cusolverDnHandle_t handle_ = nullptr;
|
||||||
|
internal::CusolverParams params_; // cuSOLVER params (created once, reused)
|
||||||
|
internal::DeviceBuffer d_factor_; // factored L (or U) on device (grows, never shrinks)
|
||||||
|
size_t factor_alloc_size_ = 0; // current d_factor_ allocation size
|
||||||
|
internal::DeviceBuffer d_scratch_; // combined workspace + info word (grows, never shrinks)
|
||||||
|
size_t scratch_size_ = 0; // current scratch allocation size
|
||||||
|
std::vector<char> h_workspace_; // host workspace (kept alive until next compute)
|
||||||
|
Index n_ = 0;
|
||||||
|
int64_t lda_ = 0;
|
||||||
|
ComputationInfo info_ = InvalidInput;
|
||||||
|
int info_word_ = 0; // host-side target for async info download
|
||||||
|
bool info_synced_ = true; // has the stream been synced for info?
|
||||||
|
|
||||||
|
// Common prologue of every compute() overload.
// Records the problem size and resets the status to InvalidInput.
// Returns true if there is a matrix to factorize; returns false for a 0x0
// problem, which is reported as an immediate Success.
bool begin_compute(Index rows) {
  n_ = rows;
  info_ = InvalidInput;
  if (n_ == 0) {
    info_ = Success;
    // Mark the status as final. If an earlier factorization was never synced,
    // sync_info() would otherwise overwrite this Success with that older
    // factorization's info word on the next info()/solve().
    info_synced_ = true;
    return false;
  }
  return true;
}
|
||||||
|
|
||||||
|
// Size in bytes of the n_-column factor stored with leading dimension lda_.
size_t factorBytes() const {
  const int64_t column_count = static_cast<int64_t>(n_);
  return rhsBytes(column_count, lda_);
}
|
||||||
|
|
||||||
|
static size_t rhsBytes(int64_t cols, int64_t outer_stride) {
|
||||||
|
return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
|
||||||
|
}
|
||||||
|
|
||||||
|
void allocate_factor_storage() {
|
||||||
|
size_t needed = factorBytes();
|
||||||
|
if (needed > factor_alloc_size_) {
|
||||||
|
d_factor_ = internal::DeviceBuffer(needed);
|
||||||
|
factor_alloc_size_ = needed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure d_scratch_ is at least `workspace_bytes + sizeof(int)`.
|
||||||
|
// Layout: [workspace (workspace_bytes) | info_word (sizeof(int))].
|
||||||
|
// Ensure d_scratch_ can hold workspace_bytes + an aligned info word.
|
||||||
|
// Grows but never shrinks. Syncs the stream before reallocating to
|
||||||
|
// avoid freeing memory that async kernels may still be using.
|
||||||
|
void ensure_scratch(size_t workspace_bytes) {
|
||||||
|
// Round up so the info word is naturally aligned.
|
||||||
|
// 16-byte alignment for optimal GPU memory access.
|
||||||
|
constexpr size_t kAlign = 16;
|
||||||
|
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
|
||||||
|
size_t needed = workspace_bytes + sizeof(int);
|
||||||
|
if (needed > scratch_size_) {
|
||||||
|
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
d_scratch_ = internal::DeviceBuffer(needed);
|
||||||
|
scratch_size_ = needed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Device workspace handed to cuSOLVER: the start of the scratch buffer.
void* scratch_workspace() const {
  return d_scratch_.ptr;
}
|
||||||
|
int* scratch_info() const {
|
||||||
|
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename CopyRhs>
|
||||||
|
DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, CopyRhs&& copy_rhs) const {
|
||||||
|
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||||
|
constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_, ColMajor>::value;
|
||||||
|
|
||||||
|
Scalar* d_x_ptr = nullptr;
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_x_ptr), rhsBytes(nrhs, ldb)));
|
||||||
|
copy_rhs(d_x_ptr);
|
||||||
|
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, static_cast<int64_t>(n_), nrhs, dtype,
|
||||||
|
d_factor_.ptr, lda_, dtype, d_x_ptr, ldb, scratch_info()));
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs));
|
||||||
|
result.recordReady(stream_);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the stream and the cuSOLVER handle, bind the handle to the stream,
// and allocate the minimal scratch buffer so the info word always exists.
void init_context() {
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
  EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
  EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
  // Zero workspace bytes: only the info word is needed at this point.
  ensure_scratch(0);
}
|
||||||
|
|
||||||
|
// Synchronize stream and interpret the info word. No-op if already synced.
|
||||||
|
void sync_info() {
|
||||||
|
if (!info_synced_) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
info_ = (info_word_ == 0) ? Success : NumericalIssue;
|
||||||
|
info_synced_ = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run cusolverDnXpotrf on d_factor_ (already on device).
|
||||||
|
// Enqueues factorization + async info download. Does NOT sync.
|
||||||
|
// Workspaces are stored as members to ensure they outlive the async kernels.
|
||||||
|
void factorize() {
|
||||||
|
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||||
|
constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_, ColMajor>::value;
|
||||||
|
|
||||||
|
info_synced_ = false;
|
||||||
|
info_ = InvalidInput;
|
||||||
|
|
||||||
|
size_t dev_ws_bytes = 0, host_ws_bytes = 0;
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(handle_, params_.p, uplo, static_cast<int64_t>(n_), dtype,
|
||||||
|
d_factor_.ptr, lda_, dtype, &dev_ws_bytes, &host_ws_bytes));
|
||||||
|
|
||||||
|
ensure_scratch(dev_ws_bytes);
|
||||||
|
h_workspace_.resize(host_ws_bytes);
|
||||||
|
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(
|
||||||
|
handle_, params_.p, uplo, static_cast<int64_t>(n_), dtype, d_factor_.ptr, lda_, dtype, scratch_workspace(),
|
||||||
|
dev_ws_bytes, host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info()));
|
||||||
|
|
||||||
|
// Enqueue async download of info word — sync deferred to info() or solve().
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_LLT_H
|
||||||
371
Eigen/src/GPU/GpuLU.h
Normal file
371
Eigen/src/GPU/GpuLU.h
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Eigen Authors
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU partial-pivoting LU decomposition using cuSOLVER.
|
||||||
|
//
|
||||||
|
// Wraps cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve).
|
||||||
|
// The factored LU matrix and pivot array are kept in device memory for the
|
||||||
|
// lifetime of the object, so repeated solves only transfer the RHS/solution.
|
||||||
|
//
|
||||||
|
// Requires CUDA 11.0+ (cusolverDnX generic API).
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuLU<double> lu(A); // upload A, getrf, LU+ipiv on device
|
||||||
|
// if (lu.info() != Success) { ... }
|
||||||
|
// MatrixXd x = lu.solve(b); // getrs NoTrans, only b transferred
|
||||||
|
// MatrixXd xt = lu.solve(b, GpuLU<double>::Transpose); // A^T x = b
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_LU_H
|
||||||
|
#define EIGEN_GPU_LU_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuSolverSupport.h"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
/** \ingroup GPU_Module
|
||||||
|
* \class GpuLU
|
||||||
|
* \brief GPU LU decomposition with partial pivoting via cuSOLVER
|
||||||
|
*
|
||||||
|
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
|
||||||
|
*
|
||||||
|
* Decomposes a square matrix A = P L U on the GPU and retains the factored
|
||||||
|
* matrix and pivot array in device memory. Solves A*X=B, A^T*X=B, or
|
||||||
|
* A^H*X=B by passing the appropriate TransposeMode.
|
||||||
|
*
|
||||||
|
* Each GpuLU object owns a dedicated CUDA stream and cuSOLVER handle.
|
||||||
|
*/
|
||||||
|
template <typename Scalar_>
|
||||||
|
class GpuLU {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||||
|
|
||||||
|
/** Controls which system is solved in solve(). */
|
||||||
|
enum TransposeMode {
|
||||||
|
NoTranspose, ///< Solve A * X = B
|
||||||
|
Transpose, ///< Solve A^T * X = B
|
||||||
|
ConjugateTranspose ///< Solve A^H * X = B (same as Transpose for real types)
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- Construction / destruction ------------------------------------------
|
||||||
|
|
||||||
|
GpuLU() { init_context(); }
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
explicit GpuLU(const EigenBase<InputType>& A) {
|
||||||
|
init_context();
|
||||||
|
compute(A);
|
||||||
|
}
|
||||||
|
|
||||||
|
~GpuLU() {
|
||||||
|
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||||
|
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuLU(const GpuLU&) = delete;
|
||||||
|
GpuLU& operator=(const GpuLU&) = delete;
|
||||||
|
|
||||||
|
GpuLU(GpuLU&& o) noexcept
|
||||||
|
: stream_(o.stream_),
|
||||||
|
handle_(o.handle_),
|
||||||
|
params_(std::move(o.params_)),
|
||||||
|
d_lu_(std::move(o.d_lu_)),
|
||||||
|
lu_alloc_size_(o.lu_alloc_size_),
|
||||||
|
d_ipiv_(std::move(o.d_ipiv_)),
|
||||||
|
d_scratch_(std::move(o.d_scratch_)),
|
||||||
|
scratch_size_(o.scratch_size_),
|
||||||
|
h_workspace_(std::move(o.h_workspace_)),
|
||||||
|
n_(o.n_),
|
||||||
|
lda_(o.lda_),
|
||||||
|
info_(o.info_),
|
||||||
|
info_word_(o.info_word_),
|
||||||
|
info_synced_(o.info_synced_) {
|
||||||
|
o.stream_ = nullptr;
|
||||||
|
o.handle_ = nullptr;
|
||||||
|
o.lu_alloc_size_ = 0;
|
||||||
|
o.scratch_size_ = 0;
|
||||||
|
o.n_ = 0;
|
||||||
|
o.info_ = InvalidInput;
|
||||||
|
o.info_word_ = 0;
|
||||||
|
o.info_synced_ = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuLU& operator=(GpuLU&& o) noexcept {
|
||||||
|
if (this != &o) {
|
||||||
|
if (handle_) (void)cusolverDnDestroy(handle_);
|
||||||
|
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||||
|
stream_ = o.stream_;
|
||||||
|
handle_ = o.handle_;
|
||||||
|
params_ = std::move(o.params_);
|
||||||
|
d_lu_ = std::move(o.d_lu_);
|
||||||
|
lu_alloc_size_ = o.lu_alloc_size_;
|
||||||
|
d_ipiv_ = std::move(o.d_ipiv_);
|
||||||
|
d_scratch_ = std::move(o.d_scratch_);
|
||||||
|
scratch_size_ = o.scratch_size_;
|
||||||
|
h_workspace_ = std::move(o.h_workspace_);
|
||||||
|
n_ = o.n_;
|
||||||
|
lda_ = o.lda_;
|
||||||
|
info_ = o.info_;
|
||||||
|
info_word_ = o.info_word_;
|
||||||
|
info_synced_ = o.info_synced_;
|
||||||
|
o.stream_ = nullptr;
|
||||||
|
o.handle_ = nullptr;
|
||||||
|
o.lu_alloc_size_ = 0;
|
||||||
|
o.scratch_size_ = 0;
|
||||||
|
o.n_ = 0;
|
||||||
|
o.info_ = InvalidInput;
|
||||||
|
o.info_word_ = 0;
|
||||||
|
o.info_synced_ = true;
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Factorization -------------------------------------------------------
|
||||||
|
|
||||||
|
/** Compute the LU factorization of A (host matrix, must be square). */
|
||||||
|
template <typename InputType>
|
||||||
|
GpuLU& compute(const EigenBase<InputType>& A) {
|
||||||
|
eigen_assert(A.rows() == A.cols() && "GpuLU requires a square matrix");
|
||||||
|
if (!begin_compute(A.rows())) return *this;
|
||||||
|
|
||||||
|
const PlainMatrix mat(A.derived());
|
||||||
|
lda_ = static_cast<int64_t>(mat.rows());
|
||||||
|
allocate_lu_storage();
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Compute the LU factorization from a device-resident matrix (D2D copy). */
|
||||||
|
GpuLU& compute(const DeviceMatrix<Scalar>& d_A) {
|
||||||
|
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
|
||||||
|
if (!begin_compute(d_A.rows())) return *this;
|
||||||
|
|
||||||
|
lda_ = static_cast<int64_t>(d_A.rows());
|
||||||
|
d_A.waitReady(stream_);
|
||||||
|
allocate_lu_storage();
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.ptr, d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_));
|
||||||
|
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Compute the LU factorization from a device matrix (move, no copy). */
|
||||||
|
GpuLU& compute(DeviceMatrix<Scalar>&& d_A) {
|
||||||
|
eigen_assert(d_A.rows() == d_A.cols() && "GpuLU requires a square matrix");
|
||||||
|
if (!begin_compute(d_A.rows())) return *this;
|
||||||
|
|
||||||
|
lda_ = static_cast<int64_t>(d_A.rows());
|
||||||
|
d_A.waitReady(stream_);
|
||||||
|
d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
|
||||||
|
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve ---------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Solve op(A) * X = B using the cached LU factorization (host → host).
|
||||||
|
*
|
||||||
|
* \param B Right-hand side (n x nrhs host matrix).
|
||||||
|
* \param mode NoTranspose (default), Transpose, or ConjugateTranspose.
|
||||||
|
*/
|
||||||
|
template <typename Rhs>
|
||||||
|
PlainMatrix solve(const MatrixBase<Rhs>& B, TransposeMode mode = NoTranspose) const {
|
||||||
|
const_cast<GpuLU*>(this)->sync_info();
|
||||||
|
eigen_assert(info_ == Success && "GpuLU::solve called on a failed or uninitialized factorization");
|
||||||
|
eigen_assert(B.rows() == n_);
|
||||||
|
|
||||||
|
const PlainMatrix rhs(B);
|
||||||
|
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
||||||
|
const int64_t ldb = static_cast<int64_t>(rhs.rows());
|
||||||
|
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_x_ptr, rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
|
||||||
|
});
|
||||||
|
|
||||||
|
PlainMatrix X(n_, B.cols());
|
||||||
|
int solve_info = 0;
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(X.data(), d_X.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
|
||||||
|
eigen_assert(solve_info == 0 && "cusolverDnXgetrs reported an error");
|
||||||
|
return X;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Solve op(A) * X = B with device-resident RHS. Fully async. */
|
||||||
|
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B, TransposeMode mode = NoTranspose) const {
|
||||||
|
eigen_assert(d_B.rows() == n_);
|
||||||
|
d_B.waitReady(stream_);
|
||||||
|
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
|
||||||
|
const int64_t ldb = static_cast<int64_t>(d_B.rows());
|
||||||
|
return solve_impl(nrhs, ldb, mode, [&](Scalar* d_x_ptr) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_x_ptr, d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Accessors -----------------------------------------------------------
|
||||||
|
|
||||||
|
/** Lazily synchronizes the stream on first call after compute(). */
|
||||||
|
ComputationInfo info() const {
|
||||||
|
const_cast<GpuLU*>(this)->sync_info();
|
||||||
|
return info_;
|
||||||
|
}
|
||||||
|
Index rows() const { return n_; }
|
||||||
|
Index cols() const { return n_; }
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cusolverDnHandle_t handle_ = nullptr;
|
||||||
|
internal::CusolverParams params_; // cuSOLVER params (created once, reused)
|
||||||
|
internal::DeviceBuffer d_lu_; // LU factors on device (grows, never shrinks)
|
||||||
|
size_t lu_alloc_size_ = 0; // current d_lu_ allocation size
|
||||||
|
internal::DeviceBuffer d_ipiv_; // pivot indices (int64_t) on device
|
||||||
|
internal::DeviceBuffer d_scratch_; // combined workspace + info word (grows, never shrinks)
|
||||||
|
size_t scratch_size_ = 0; // current scratch allocation size
|
||||||
|
std::vector<char> h_workspace_; // host workspace (kept alive until next compute)
|
||||||
|
Index n_ = 0;
|
||||||
|
int64_t lda_ = 0;
|
||||||
|
ComputationInfo info_ = InvalidInput;
|
||||||
|
int info_word_ = 0; // host-side target for async info download
|
||||||
|
bool info_synced_ = true; // has the stream been synced for info?
|
||||||
|
|
||||||
|
bool begin_compute(Index rows) {
|
||||||
|
n_ = rows;
|
||||||
|
info_ = InvalidInput;
|
||||||
|
if (n_ == 0) {
|
||||||
|
info_ = Success;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t matrixBytes() const { return matrixBytes(static_cast<int64_t>(n_), lda_); }
|
||||||
|
|
||||||
|
static size_t matrixBytes(int64_t cols, int64_t outer_stride) {
|
||||||
|
return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
|
||||||
|
}
|
||||||
|
|
||||||
|
void allocate_lu_storage() {
|
||||||
|
size_t needed = matrixBytes();
|
||||||
|
if (needed > lu_alloc_size_) {
|
||||||
|
d_lu_ = internal::DeviceBuffer(needed);
|
||||||
|
lu_alloc_size_ = needed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure d_scratch_ is at least `workspace_bytes + sizeof(int)`.
|
||||||
|
// Layout: [workspace (workspace_bytes) | info_word (sizeof(int))].
|
||||||
|
// Ensure d_scratch_ can hold workspace_bytes + an aligned info word.
|
||||||
|
// Grows but never shrinks. Syncs the stream before reallocating to
|
||||||
|
// avoid freeing memory that async kernels may still be using.
|
||||||
|
void ensure_scratch(size_t workspace_bytes) {
|
||||||
|
constexpr size_t kAlign = 16;
|
||||||
|
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
|
||||||
|
size_t needed = workspace_bytes + sizeof(int);
|
||||||
|
if (needed > scratch_size_) {
|
||||||
|
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
d_scratch_ = internal::DeviceBuffer(needed);
|
||||||
|
scratch_size_ = needed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void* scratch_workspace() const { return d_scratch_.ptr; }
|
||||||
|
int* scratch_info() const {
|
||||||
|
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename CopyRhs>
|
||||||
|
DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, TransposeMode mode, CopyRhs&& copy_rhs) const {
|
||||||
|
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||||
|
const cublasOperation_t trans = to_cublas_op(mode);
|
||||||
|
|
||||||
|
Scalar* d_x_ptr = nullptr;
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_x_ptr), matrixBytes(nrhs, ldb)));
|
||||||
|
copy_rhs(d_x_ptr);
|
||||||
|
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(handle_, params_.p, trans, static_cast<int64_t>(n_), nrhs, dtype, d_lu_.ptr,
|
||||||
|
lda_, static_cast<const int64_t*>(d_ipiv_.ptr), dtype, d_x_ptr, ldb,
|
||||||
|
scratch_info()));
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> result(d_x_ptr, n_, static_cast<Index>(nrhs));
|
||||||
|
result.recordReady(stream_);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
void init_context() {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
|
||||||
|
ensure_scratch(0); // allocate at least the info word
|
||||||
|
}
|
||||||
|
|
||||||
|
void sync_info() {
|
||||||
|
if (!info_synced_) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
info_ = (info_word_ == 0) ? Success : NumericalIssue;
|
||||||
|
info_synced_ = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run cusolverDnXgetrf on d_lu_ (already on device). Allocates d_ipiv_.
|
||||||
|
// Enqueues factorization + async info download. Does NOT sync.
|
||||||
|
// Workspaces are stored as members to ensure they outlive the async kernels.
|
||||||
|
void factorize() {
|
||||||
|
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||||
|
const size_t ipiv_bytes = static_cast<size_t>(n_) * sizeof(int64_t);
|
||||||
|
|
||||||
|
info_synced_ = false;
|
||||||
|
info_ = InvalidInput;
|
||||||
|
|
||||||
|
d_ipiv_ = internal::DeviceBuffer(ipiv_bytes);
|
||||||
|
|
||||||
|
size_t dev_ws_bytes = 0, host_ws_bytes = 0;
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(handle_, params_.p, static_cast<int64_t>(n_),
|
||||||
|
static_cast<int64_t>(n_), dtype, d_lu_.ptr, lda_, dtype,
|
||||||
|
&dev_ws_bytes, &host_ws_bytes));
|
||||||
|
|
||||||
|
ensure_scratch(dev_ws_bytes);
|
||||||
|
h_workspace_.resize(host_ws_bytes);
|
||||||
|
|
||||||
|
EIGEN_CUSOLVER_CHECK(
|
||||||
|
cusolverDnXgetrf(handle_, params_.p, static_cast<int64_t>(n_), static_cast<int64_t>(n_), dtype, d_lu_.ptr, lda_,
|
||||||
|
static_cast<int64_t*>(d_ipiv_.ptr), dtype, scratch_workspace(), dev_ws_bytes,
|
||||||
|
host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info()));
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
static cublasOperation_t to_cublas_op(TransposeMode mode) {
|
||||||
|
switch (mode) {
|
||||||
|
case Transpose:
|
||||||
|
return CUBLAS_OP_T;
|
||||||
|
case ConjugateTranspose:
|
||||||
|
return CUBLAS_OP_C;
|
||||||
|
default:
|
||||||
|
return CUBLAS_OP_N;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_LU_H
|
||||||
389
Eigen/src/GPU/GpuQR.h
Normal file
389
Eigen/src/GPU/GpuQR.h
Normal file
@@ -0,0 +1,389 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU QR decomposition using cuSOLVER.
|
||||||
|
//
|
||||||
|
// Wraps cusolverDnXgeqrf (factorization), cusolverDnXormqr (apply Q),
|
||||||
|
// cusolverDnXorgqr (form Q), and cublasXtrsm (triangular solve on R).
|
||||||
|
//
|
||||||
|
// The factored matrix (reflectors + R) and tau stay in device memory.
|
||||||
|
// Solve uses ormqr + trsm without forming Q explicitly.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuQR<double> qr(A); // upload A, geqrf
|
||||||
|
// if (qr.info() != Success) { ... }
|
||||||
|
// MatrixXd X = qr.solve(B); // Q^H * B via ormqr, then trsm on R
|
||||||
|
//
|
||||||
|
// Expression syntax:
|
||||||
|
// d_X = d_A.qr().solve(d_B); // temporary, no caching
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_QR_H
|
||||||
|
#define EIGEN_GPU_QR_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuSolverSupport.h"
|
||||||
|
#include "./CuBlasSupport.h"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class GpuQR {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||||
|
|
||||||
|
/** Default constructor: sets up the execution context via init_context();
 * no matrix is factorized yet. */
GpuQR() {
  init_context();
}
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
explicit GpuQR(const EigenBase<InputType>& A) {
|
||||||
|
init_context();
|
||||||
|
compute(A);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Destroys the cuSOLVER handle, the cuBLAS handle and finally the stream.
 * Null members (e.g. after a move) are skipped; errors are ignored. */
~GpuQR() {
  if (handle_) {
    (void)cusolverDnDestroy(handle_);
  }
  if (cublas_) {
    (void)cublasDestroy(cublas_);
  }
  if (stream_) {
    (void)cudaStreamDestroy(stream_);
  }
}
|
||||||
|
|
||||||
|
// Not copyable: the object owns a stream and library handles that the
// destructor destroys. Move operations are provided instead.
GpuQR(const GpuQR&) = delete;
GpuQR& operator=(const GpuQR&) = delete;
|
||||||
|
|
||||||
|
/** Move constructor: takes over the stream, handles, buffers and status of \a o.
 *
 * If \a o still has an unsynced status (info_synced_ == false), its stream is
 * drained before the source is reset, and info_word_ is re-read afterwards.
 * NOTE(review): this assumes GpuQR::factorize() enqueues an asynchronous
 * download into info_word_, as the sibling GPU solvers do — confirm. Without
 * the drain such a download would land in the moved-from object.
 * CUDA errors are ignored because the constructor is noexcept.
 */
GpuQR(GpuQR&& o) noexcept
    : stream_(o.stream_),
      handle_(o.handle_),
      cublas_(o.cublas_),
      params_(std::move(o.params_)),
      d_qr_(std::move(o.d_qr_)),
      d_tau_(std::move(o.d_tau_)),
      d_scratch_(std::move(o.d_scratch_)),
      scratch_size_(o.scratch_size_),
      h_workspace_(std::move(o.h_workspace_)),
      m_(o.m_),
      n_(o.n_),
      lda_(o.lda_),
      info_(o.info_),
      info_word_(o.info_word_),
      info_synced_(o.info_synced_) {
  if (stream_ && !info_synced_) {
    (void)cudaStreamSynchronize(stream_);
    // Pick up the value that was written into the source object.
    info_word_ = o.info_word_;
  }
  // Leave the source in the "never computed" state.
  o.stream_ = nullptr;
  o.handle_ = nullptr;
  o.cublas_ = nullptr;
  o.scratch_size_ = 0;
  o.m_ = 0;
  o.n_ = 0;
  o.lda_ = 0;
  o.info_ = InvalidInput;
  o.info_word_ = 0;
  o.info_synced_ = true;
}
|
||||||
|
|
||||||
|
/** Move assignment: releases this object's resources and takes over those of \a o.
 *
 * Both streams are drained before any state changes hands.
 * NOTE(review): this assumes GpuQR::factorize() enqueues an asynchronous
 * download into info_word_, as the sibling GPU solvers do — confirm. Without
 * the drain such a download could land in a member that was just overwritten
 * (this) or abandoned (o).
 * CUDA errors are ignored because the operator is noexcept.
 *
 * \returns *this
 */
GpuQR& operator=(GpuQR&& o) noexcept {
  if (this != &o) {
    if (stream_) (void)cudaStreamSynchronize(stream_);
    if (o.stream_ && !o.info_synced_) (void)cudaStreamSynchronize(o.stream_);

    if (handle_) (void)cusolverDnDestroy(handle_);
    if (cublas_) (void)cublasDestroy(cublas_);
    if (stream_) (void)cudaStreamDestroy(stream_);

    stream_ = o.stream_;
    handle_ = o.handle_;
    cublas_ = o.cublas_;
    params_ = std::move(o.params_);
    d_qr_ = std::move(o.d_qr_);
    d_tau_ = std::move(o.d_tau_);
    d_scratch_ = std::move(o.d_scratch_);
    scratch_size_ = o.scratch_size_;
    h_workspace_ = std::move(o.h_workspace_);
    m_ = o.m_;
    n_ = o.n_;
    lda_ = o.lda_;
    info_ = o.info_;
    info_word_ = o.info_word_;
    info_synced_ = o.info_synced_;

    // Leave the source in the "never computed" state.
    o.stream_ = nullptr;
    o.handle_ = nullptr;
    o.cublas_ = nullptr;
    o.scratch_size_ = 0;
    o.m_ = 0;
    o.n_ = 0;
    o.lda_ = 0;
    o.info_ = InvalidInput;
    o.info_word_ = 0;
    o.info_synced_ = true;
  }
  return *this;
}
|
||||||
|
|
||||||
|
// ---- Factorization -------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
GpuQR& compute(const EigenBase<InputType>& A) {
|
||||||
|
m_ = A.rows();
|
||||||
|
n_ = A.cols();
|
||||||
|
info_ = InvalidInput;
|
||||||
|
info_synced_ = false;
|
||||||
|
|
||||||
|
if (m_ == 0 || n_ == 0) {
|
||||||
|
info_ = Success;
|
||||||
|
info_synced_ = true;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
const PlainMatrix mat(A.derived());
|
||||||
|
lda_ = static_cast<int64_t>(mat.rows());
|
||||||
|
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||||
|
const size_t tau_bytes = static_cast<size_t>((std::min)(m_, n_)) * sizeof(Scalar);
|
||||||
|
|
||||||
|
d_qr_ = internal::DeviceBuffer(mat_bytes);
|
||||||
|
d_tau_ = internal::DeviceBuffer(tau_bytes);
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_qr_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Factorizes a device-resident matrix.
 *
 * Waits (on stream_) for \a d_A to be ready, copies it device-to-device into
 * a freshly allocated working buffer, and enqueues the factorization through
 * factorize(). The input matrix itself is only read. An empty matrix is
 * reported as Success immediately.
 *
 * \returns *this, so calls can be chained. */
GpuQR& compute(const DeviceMatrix<Scalar>& d_A) {
  m_ = d_A.rows();
  n_ = d_A.cols();

  // Nothing to factorize for an empty matrix: the status is final right away.
  const bool is_empty = (m_ == 0) || (n_ == 0);
  info_ = is_empty ? Success : InvalidInput;
  info_synced_ = is_empty;
  if (is_empty) return *this;

  lda_ = static_cast<int64_t>(d_A.rows());
  const size_t qr_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
  const size_t tau_bytes = static_cast<size_t>((std::min)(m_, n_)) * sizeof(Scalar);

  // Order our stream after whatever produced d_A before reading it.
  d_A.waitReady(stream_);

  d_qr_ = internal::DeviceBuffer(qr_bytes);
  d_tau_ = internal::DeviceBuffer(tau_bytes);
  EIGEN_CUDA_RUNTIME_CHECK(
      cudaMemcpyAsync(d_qr_.ptr, d_A.data(), qr_bytes, cudaMemcpyDeviceToDevice, stream_));

  factorize();
  return *this;
}
|
||||||
|
|
||||||
|
// ---- Solve ---------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Solve A * X = B via QR: X = R^{-1} * Q^H * B (least-squares for m >= n).
|
||||||
|
* Uses ormqr (apply Q^H) + trsm (solve R), without forming Q explicitly.
|
||||||
|
* Requires m >= n (overdetermined or square). Underdetermined not supported.
|
||||||
|
*
|
||||||
|
* TODO: Add device-side accessor for the R factor (and Q application) as
|
||||||
|
* DeviceMatrix, so users can chain GPU operations without host round-trips. */
|
||||||
|
template <typename Rhs>
|
||||||
|
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success && "GpuQR::solve called on a failed or uninitialized factorization");
|
||||||
|
eigen_assert(B.rows() == m_);
|
||||||
|
eigen_assert(m_ >= n_ && "GpuQR::solve requires m >= n (use SVD for underdetermined systems)");
|
||||||
|
|
||||||
|
const PlainMatrix rhs(B);
|
||||||
|
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
||||||
|
const int64_t ldb = static_cast<int64_t>(rhs.rows()); // = m_
|
||||||
|
const size_t b_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||||
|
|
||||||
|
// Upload B to device (m × nrhs buffer).
|
||||||
|
internal::DeviceBuffer d_B(b_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.ptr, rhs.data(), b_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
// Apply Q^H to B in-place: d_B becomes m × nrhs, first n rows hold Q^H * B relevant part.
|
||||||
|
apply_QH(d_B.ptr, ldb, nrhs);
|
||||||
|
|
||||||
|
// Solve R * X = (Q^H * B)[0:n,:] via trsm on the first n rows.
|
||||||
|
Scalar alpha(1);
|
||||||
|
EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
|
||||||
|
CUBLAS_DIAG_NON_UNIT, static_cast<int>(n_), static_cast<int>(nrhs), &alpha,
|
||||||
|
static_cast<const Scalar*>(d_qr_.ptr), static_cast<int>(lda_),
|
||||||
|
static_cast<Scalar*>(d_B.ptr), static_cast<int>(ldb)));
|
||||||
|
|
||||||
|
// Download the first n rows of each column (stride = ldb = m, width = n).
|
||||||
|
PlainMatrix X(n_, rhs.cols());
|
||||||
|
if (m_ == n_) {
|
||||||
|
// Square: dense copy, no stride mismatch.
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_B.ptr,
|
||||||
|
static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost, stream_));
|
||||||
|
} else {
|
||||||
|
// Overdetermined: 2D copy to extract first n rows from each column.
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(
|
||||||
|
X.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_B.ptr, static_cast<size_t>(ldb) * sizeof(Scalar),
|
||||||
|
static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
}
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
return X;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Solve with device-resident RHS. Returns n × nrhs DeviceMatrix. */
|
||||||
|
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success && "GpuQR::solve called on a failed or uninitialized factorization");
|
||||||
|
eigen_assert(d_B.rows() == m_);
|
||||||
|
eigen_assert(m_ >= n_ && "GpuQR::solve requires m >= n (use SVD for underdetermined systems)");
|
||||||
|
d_B.waitReady(stream_);
|
||||||
|
|
||||||
|
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
|
||||||
|
const int64_t ldb = static_cast<int64_t>(d_B.rows()); // = m_
|
||||||
|
const size_t b_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||||
|
|
||||||
|
// D2D copy B into working buffer (ormqr and trsm are in-place).
|
||||||
|
internal::DeviceBuffer d_work(b_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_work.ptr, d_B.data(), b_bytes, cudaMemcpyDeviceToDevice, stream_));
|
||||||
|
|
||||||
|
apply_QH(d_work.ptr, ldb, nrhs);
|
||||||
|
|
||||||
|
// trsm on the first n rows.
|
||||||
|
Scalar alpha(1);
|
||||||
|
EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
|
||||||
|
CUBLAS_DIAG_NON_UNIT, static_cast<int>(n_), static_cast<int>(nrhs), &alpha,
|
||||||
|
static_cast<const Scalar*>(d_qr_.ptr), static_cast<int>(lda_),
|
||||||
|
static_cast<Scalar*>(d_work.ptr), static_cast<int>(ldb)));
|
||||||
|
|
||||||
|
if (m_ == n_) {
|
||||||
|
// Square: result is the whole buffer, dense.
|
||||||
|
DeviceMatrix<Scalar> result(static_cast<Scalar*>(d_work.ptr), n_, static_cast<Index>(nrhs));
|
||||||
|
d_work.ptr = nullptr; // transfer ownership
|
||||||
|
result.recordReady(stream_);
|
||||||
|
return result;
|
||||||
|
} else {
|
||||||
|
// Overdetermined: copy first n rows of each column into a dense n × nrhs result.
|
||||||
|
DeviceMatrix<Scalar> result(n_, static_cast<Index>(nrhs));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(result.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_work.ptr,
|
||||||
|
static_cast<size_t>(ldb) * sizeof(Scalar),
|
||||||
|
static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs),
|
||||||
|
cudaMemcpyDeviceToDevice, stream_));
|
||||||
|
result.recordReady(stream_);
|
||||||
|
return result;
|
||||||
|
// d_work freed here via RAII — safe because stream is ordered.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Accessors -----------------------------------------------------------
|
||||||
|
|
||||||
|
/** Status of the most recent factorization.
 *
 * If the status has not been fetched yet, sync_info() first waits for the
 * stream, so this call may block. */
ComputationInfo info() const {
  sync_info();  // resolves the pending status, if any
  return info_;
}
|
||||||
|
|
||||||
|
Index rows() const { return m_; }
|
||||||
|
Index cols() const { return n_; }
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cusolverDnHandle_t handle_ = nullptr;
|
||||||
|
cublasHandle_t cublas_ = nullptr;
|
||||||
|
internal::CusolverParams params_;
|
||||||
|
internal::DeviceBuffer d_qr_; // QR factors (reflectors in lower, R in upper)
|
||||||
|
internal::DeviceBuffer d_tau_; // Householder scalars (min(m,n))
|
||||||
|
internal::DeviceBuffer d_scratch_; // workspace + info word
|
||||||
|
size_t scratch_size_ = 0;
|
||||||
|
std::vector<char> h_workspace_;
|
||||||
|
Index m_ = 0;
|
||||||
|
Index n_ = 0;
|
||||||
|
int64_t lda_ = 0;
|
||||||
|
ComputationInfo info_ = InvalidInput;
|
||||||
|
int info_word_ = 0;
|
||||||
|
bool info_synced_ = true;
|
||||||
|
|
||||||
|
// Creates the stream and both library handles, binds the handles to the
// stream, and allocates the minimal scratch buffer (info word only).
void init_context() {
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));

  // cuSOLVER handle, bound to our stream.
  EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
  EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));

  // cuBLAS handle, bound to the same stream.
  EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
  EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));

  // Zero workspace bytes still reserves room for the info word.
  ensure_scratch(0);
}
|
||||||
|
|
||||||
|
void ensure_scratch(size_t workspace_bytes) {
|
||||||
|
constexpr size_t kAlign = 16;
|
||||||
|
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
|
||||||
|
size_t needed = workspace_bytes + sizeof(int);
|
||||||
|
if (needed > scratch_size_) {
|
||||||
|
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
d_scratch_ = internal::DeviceBuffer(needed);
|
||||||
|
scratch_size_ = needed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void* scratch_workspace() const { return d_scratch_.ptr; }
|
||||||
|
int* scratch_info() const {
|
||||||
|
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
|
||||||
|
}
|
||||||
|
|
||||||
|
void sync_info() const {
|
||||||
|
if (!info_synced_) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
const_cast<GpuQR*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
|
||||||
|
const_cast<GpuQR*>(this)->info_synced_ = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void factorize() {
|
||||||
|
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||||
|
|
||||||
|
info_synced_ = false;
|
||||||
|
info_ = InvalidInput;
|
||||||
|
|
||||||
|
size_t dev_ws = 0, host_ws = 0;
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf_bufferSize(handle_, params_.p, static_cast<int64_t>(m_),
|
||||||
|
static_cast<int64_t>(n_), dtype, d_qr_.ptr, lda_, dtype,
|
||||||
|
d_tau_.ptr, dtype, &dev_ws, &host_ws));
|
||||||
|
|
||||||
|
ensure_scratch(dev_ws);
|
||||||
|
h_workspace_.resize(host_ws);
|
||||||
|
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf(handle_, params_.p, static_cast<int64_t>(m_), static_cast<int64_t>(n_), dtype,
|
||||||
|
d_qr_.ptr, lda_, dtype, d_tau_.ptr, dtype, scratch_workspace(), dev_ws,
|
||||||
|
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply Q^H to a device buffer in-place: d_B = Q^H * d_B.
|
||||||
|
// Uses type-specific ormqr (real) or unmqr (complex) wrappers from CuSolverSupport.h.
|
||||||
|
// For real types: Q^H = Q^T, use CUBLAS_OP_T. For complex: use CUBLAS_OP_C.
|
||||||
|
void apply_QH(void* d_B, int64_t ldb, int64_t nrhs) const {
|
||||||
|
const int im = static_cast<int>(m_);
|
||||||
|
const int in = static_cast<int>(nrhs);
|
||||||
|
const int ik = static_cast<int>((std::min)(m_, n_));
|
||||||
|
const int ilda = static_cast<int>(lda_);
|
||||||
|
const int ildb = static_cast<int>(ldb);
|
||||||
|
constexpr cublasOperation_t trans = NumTraits<Scalar>::IsComplex ? CUBLAS_OP_C : CUBLAS_OP_T;
|
||||||
|
|
||||||
|
int lwork = 0;
|
||||||
|
EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr_bufferSize(
|
||||||
|
handle_, CUBLAS_SIDE_LEFT, trans, im, in, ik, static_cast<const Scalar*>(d_qr_.ptr), ilda,
|
||||||
|
static_cast<const Scalar*>(d_tau_.ptr), static_cast<const Scalar*>(d_B), ildb, &lwork));
|
||||||
|
|
||||||
|
internal::DeviceBuffer d_work(static_cast<size_t>(lwork) * sizeof(Scalar));
|
||||||
|
|
||||||
|
EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr(handle_, CUBLAS_SIDE_LEFT, trans, im, in, ik,
|
||||||
|
static_cast<const Scalar*>(d_qr_.ptr), ilda,
|
||||||
|
static_cast<const Scalar*>(d_tau_.ptr), static_cast<Scalar*>(d_B),
|
||||||
|
ildb, static_cast<Scalar*>(d_work.ptr), lwork, scratch_info()));
|
||||||
|
|
||||||
|
// Sync to ensure workspace can be freed safely, and check ormqr info.
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
int ormqr_info = 0;
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(&ormqr_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost));
|
||||||
|
eigen_assert(ormqr_info == 0 && "cusolverDnXormqr reported an error");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_QR_H
|
||||||
495
Eigen/src/GPU/GpuSVD.h
Normal file
495
Eigen/src/GPU/GpuSVD.h
Normal file
@@ -0,0 +1,495 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU SVD decomposition using cuSOLVER's standard gesvd driver.
|
||||||
|
//
|
||||||
|
// Wraps cusolverDnXgesvd. Stores U, S, VT on device. Solve uses
|
||||||
|
// cuBLAS GEMM: X = VT^H * diag(D) * U^H * B.
|
||||||
|
//
|
||||||
|
// cuSOLVER returns VT (not V). We store and expose VT directly.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuSVD<double> svd(A, ComputeThinU | ComputeThinV);
|
||||||
|
// VectorXd S = svd.singularValues();
|
||||||
|
// MatrixXd U = svd.matrixU(); // m×k or m×m
|
||||||
|
// MatrixXd V = svd.matrixV(); // n×k or n×n (matches JacobiSVD)
|
||||||
|
// MatrixXd VT = svd.matrixVT(); // k×n or n×n (this is V^T)
|
||||||
|
// MatrixXd X = svd.solve(B); // pseudoinverse
|
||||||
|
// MatrixXd X = svd.solve(B, k); // truncated (top k triplets)
|
||||||
|
// MatrixXd X = svd.solve(B, 0.1); // Tikhonov regularized
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_SVD_H
|
||||||
|
#define EIGEN_GPU_SVD_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuSolverSupport.h"
|
||||||
|
#include "./CuBlasSupport.h"
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class GpuSVD {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||||
|
using RealVector = Matrix<RealScalar, Dynamic, 1>;
|
||||||
|
|
||||||
|
/** Creates an empty decomposition: only the stream, library handles and the
 * minimal scratch buffer are set up (see init_context()). */
GpuSVD() {
  init_context();
}
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
explicit GpuSVD(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
|
||||||
|
init_context();
|
||||||
|
compute(A, options);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Destroys the library handles and then the stream. Null handles are
 * skipped and the status codes of the destroy calls are ignored. */
~GpuSVD() {
  if (handle_) {
    (void)cusolverDnDestroy(handle_);
  }
  if (cublas_lt_) {
    (void)cublasLtDestroy(cublas_lt_);
  }
  if (cublas_) {
    (void)cublasDestroy(cublas_);
  }
  // The stream goes last, after every handle that was bound to it.
  if (stream_) {
    (void)cudaStreamDestroy(stream_);
  }
}
|
||||||
|
|
||||||
|
// Copying is disabled. With a user-declared destructor and deleted copy
// operations, no move operations are generated implicitly either, so the
// class is currently neither copyable nor movable.
GpuSVD(const GpuSVD&) = delete;
GpuSVD& operator=(const GpuSVD&) = delete;
|
||||||
|
// Move constructors omitted for brevity — follow GpuQR pattern.
|
||||||
|
|
||||||
|
// ---- Factorization -------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
GpuSVD& compute(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
|
||||||
|
options_ = options;
|
||||||
|
m_ = A.rows();
|
||||||
|
n_ = A.cols();
|
||||||
|
info_ = InvalidInput;
|
||||||
|
info_synced_ = false;
|
||||||
|
|
||||||
|
if (m_ == 0 || n_ == 0) {
|
||||||
|
info_ = Success;
|
||||||
|
info_synced_ = true;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
// cuSOLVER gesvd requires m >= n. For wide matrices, transpose internally.
|
||||||
|
transposed_ = (m_ < n_);
|
||||||
|
const PlainMatrix mat = transposed_ ? PlainMatrix(A.derived().adjoint()) : PlainMatrix(A.derived());
|
||||||
|
if (transposed_) std::swap(m_, n_);
|
||||||
|
|
||||||
|
lda_ = static_cast<int64_t>(mat.rows());
|
||||||
|
const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
|
||||||
|
|
||||||
|
// Copy (possibly transposed) A to device (gesvd overwrites it).
|
||||||
|
d_A_ = internal::DeviceBuffer(mat_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, mat.data(), mat_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
factorize();
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Decomposes a device-resident matrix.
 *
 * Waits (on stream_) for \a d_A, then fills the internal working buffer:
 * a plain device-to-device copy for tall/square inputs, or the adjoint
 * produced by cuBLAS geam for wide inputs (rows < cols). In the wide case
 * m_ and n_ hold the swapped (internal) dimensions and transposed_ is set.
 * \a d_A itself is only read.
 *
 * An empty matrix is reported as Success without any device work.
 *
 * \param d_A      matrix to decompose.
 * \param options  combination of ComputeThinU/ComputeFullU/ComputeThinV/ComputeFullV.
 * \returns *this. */
GpuSVD& compute(const DeviceMatrix<Scalar>& d_A, unsigned int options = ComputeThinU | ComputeThinV) {
  options_ = options;
  m_ = d_A.rows();
  n_ = d_A.cols();
  info_ = InvalidInput;
  info_synced_ = false;
  // Clear any flag left by a previous wide factorization: on the empty-input
  // path below m_/n_ are NOT swapped, so a stale `true` would make rows() and
  // cols() report the dimensions the wrong way round.
  transposed_ = false;

  if (m_ == 0 || n_ == 0) {
    info_ = Success;
    info_synced_ = true;
    return *this;
  }

  transposed_ = (m_ < n_);
  d_A.waitReady(stream_);

  if (transposed_) {
    // Transpose on device via cuBLAS geam: d_A_ = A^H.
    std::swap(m_, n_);
    lda_ = m_;
    const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
    d_A_ = internal::DeviceBuffer(mat_bytes);
    Scalar alpha_one(1), beta_zero(0);
    // geam: C(m×n) = alpha * op(A) + beta * op(B). B is passed as nullptr with beta = 0.
    // A is the original d_A (n × m after the swap); its adjoint is m × n.
    EIGEN_CUBLAS_CHECK(internal::cublasXgeam(
        cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(m_), static_cast<int>(n_), &alpha_one, d_A.data(),
        static_cast<int>(d_A.rows()), &beta_zero, static_cast<const Scalar*>(nullptr), static_cast<int>(m_),
        static_cast<Scalar*>(d_A_.ptr), static_cast<int>(m_)));
  } else {
    lda_ = static_cast<int64_t>(d_A.rows());
    const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
    d_A_ = internal::DeviceBuffer(mat_bytes);
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_A_.ptr, d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, stream_));
  }

  factorize();
  return *this;
}
|
||||||
|
|
||||||
|
// ---- Accessors -----------------------------------------------------------
|
||||||
|
|
||||||
|
/** Status of the most recent decomposition.
 *
 * If the status has not been fetched yet, sync_info() first waits for the
 * stream, so this call may block. */
ComputationInfo info() const {
  sync_info();  // resolves the pending status, if any
  return info_;
}
|
||||||
|
|
||||||
|
/** Row count of the matrix as passed to compute(): m_ and n_ hold the
 * swapped (internal) dimensions when the input was decomposed via its adjoint. */
Index rows() const {
  if (transposed_) return n_;
  return m_;
}

/** Column count of the matrix as passed to compute() (see rows()). */
Index cols() const {
  if (transposed_) return m_;
  return n_;
}
|
||||||
|
|
||||||
|
// TODO: Add device-side accessors (deviceU(), deviceVT(), deviceSingularValues())
|
||||||
|
// returning DeviceMatrix views of the internal buffers, so users can chain
|
||||||
|
// GPU operations without round-tripping through host memory.
|
||||||
|
|
||||||
|
/** Singular values (always available). Downloads from device on each call. */
|
||||||
|
RealVector singularValues() const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success);
|
||||||
|
const Index k = (std::min)(m_, n_);
|
||||||
|
RealVector S(k);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpy(S.data(), d_S_.ptr, static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
|
||||||
|
return S;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Left singular vectors U. Returns m_orig × k or m_orig × m_orig.
|
||||||
|
* For transposed case (m_orig < n_orig), U comes from cuSOLVER's VT. */
|
||||||
|
PlainMatrix matrixU() const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success);
|
||||||
|
eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "matrixU() requires ComputeThinU or ComputeFullU");
|
||||||
|
const Index m_orig = transposed_ ? n_ : m_;
|
||||||
|
const Index n_orig = transposed_ ? m_ : n_;
|
||||||
|
const Index k = (std::min)(m_orig, n_orig);
|
||||||
|
if (!transposed_) {
|
||||||
|
const Index ucols = (options_ & ComputeFullU) ? m_ : k;
|
||||||
|
PlainMatrix U(m_, ucols);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U.data(), d_U_.ptr,
|
||||||
|
static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost));
|
||||||
|
return U;
|
||||||
|
} else {
|
||||||
|
// Transposed: U_orig = VT_stored^H. VT_stored is vtrows × n_ (= vtrows × m_orig).
|
||||||
|
const Index vtrows = (options_ & ComputeFullU) ? m_orig : k; // Note: FullU maps to FullV of A^H
|
||||||
|
PlainMatrix VT_stored(vtrows, n_);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT_stored.data(), d_VT_.ptr,
|
||||||
|
static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost));
|
||||||
|
return VT_stored.adjoint(); // m_orig × vtrows
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Right singular vectors V. Returns n_orig × k or n_orig × n_orig.
|
||||||
|
* Equivalent to matrixVT().adjoint(). Matches Eigen's JacobiSVD::matrixV() API. */
|
||||||
|
PlainMatrix matrixV() const { return matrixVT().adjoint(); }
|
||||||
|
|
||||||
|
/** Right singular vectors transposed V^T. Returns k × n_orig or n_orig × n_orig.
|
||||||
|
* For transposed case, VT comes from cuSOLVER's U. */
|
||||||
|
PlainMatrix matrixVT() const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success);
|
||||||
|
eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "matrixVT() requires ComputeThinV or ComputeFullV");
|
||||||
|
const Index m_orig = transposed_ ? n_ : m_;
|
||||||
|
const Index n_orig = transposed_ ? m_ : n_;
|
||||||
|
const Index k = (std::min)(m_orig, n_orig);
|
||||||
|
if (!transposed_) {
|
||||||
|
const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
|
||||||
|
PlainMatrix VT(vtrows, n_);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT.data(), d_VT_.ptr,
|
||||||
|
static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost));
|
||||||
|
return VT;
|
||||||
|
} else {
|
||||||
|
// Transposed: VT_orig = U_stored^H. U_stored is m_ × ucols (= n_orig × ucols).
|
||||||
|
const Index ucols = (options_ & ComputeFullV) ? n_orig : k; // FullV maps to FullU of A^H
|
||||||
|
PlainMatrix U_stored(m_, ucols);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U_stored.data(), d_U_.ptr,
|
||||||
|
static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost));
|
||||||
|
return U_stored.adjoint(); // ucols × n_orig
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Number of singular values above threshold. */
|
||||||
|
Index rank(RealScalar threshold = RealScalar(-1)) const {
|
||||||
|
RealVector S = singularValues();
|
||||||
|
if (S.size() == 0) return 0;
|
||||||
|
if (threshold < 0) {
|
||||||
|
threshold = (std::max)(m_, n_) * S(0) * NumTraits<RealScalar>::epsilon();
|
||||||
|
}
|
||||||
|
Index r = 0;
|
||||||
|
for (Index i = 0; i < S.size(); ++i) {
|
||||||
|
if (S(i) > threshold) ++r;
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve ---------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Pseudoinverse solve: X = V * diag(1/S) * U^H * B. */
|
||||||
|
template <typename Rhs>
|
||||||
|
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
|
||||||
|
return solve_impl(B, (std::min)(m_, n_), RealScalar(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Truncated solve: use only top trunc singular triplets. */
|
||||||
|
template <typename Rhs>
|
||||||
|
PlainMatrix solve(const MatrixBase<Rhs>& B, Index trunc) const {
|
||||||
|
eigen_assert(trunc > 0 && trunc <= (std::min)(m_, n_));
|
||||||
|
return solve_impl(B, trunc, RealScalar(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Tikhonov-regularized solve: D_ii = S_i / (S_i^2 + lambda^2). */
|
||||||
|
template <typename Rhs>
|
||||||
|
PlainMatrix solve(const MatrixBase<Rhs>& B, RealScalar lambda) const {
|
||||||
|
eigen_assert(lambda > 0);
|
||||||
|
return solve_impl(B, (std::min)(m_, n_), lambda);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** The CUDA stream this object enqueues its work on. */
cudaStream_t stream() const {
  return stream_;
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cusolverDnHandle_t handle_ = nullptr;
|
||||||
|
cublasHandle_t cublas_ = nullptr;
|
||||||
|
cublasLtHandle_t cublas_lt_ = nullptr;
|
||||||
|
mutable internal::DeviceBuffer gemm_workspace_;
|
||||||
|
internal::CusolverParams params_;
|
||||||
|
internal::DeviceBuffer d_A_; // working copy of A (overwritten by gesvd)
|
||||||
|
internal::DeviceBuffer d_U_; // left singular vectors
|
||||||
|
internal::DeviceBuffer d_S_; // singular values (RealScalar)
|
||||||
|
internal::DeviceBuffer d_VT_; // right singular vectors transposed
|
||||||
|
internal::DeviceBuffer d_scratch_; // workspace + info
|
||||||
|
size_t scratch_size_ = 0;
|
||||||
|
std::vector<char> h_workspace_;
|
||||||
|
unsigned int options_ = 0;
|
||||||
|
Index m_ = 0;
|
||||||
|
Index n_ = 0;
|
||||||
|
int64_t lda_ = 0;
|
||||||
|
bool transposed_ = false; // true if m < n (we compute SVD of A^T internally)
|
||||||
|
ComputationInfo info_ = InvalidInput;
|
||||||
|
int info_word_ = 0;
|
||||||
|
bool info_synced_ = true;
|
||||||
|
|
||||||
|
// Creates the stream plus the cuSOLVER, cuBLAS and cuBLASLt handles, binds
// the cuSOLVER and cuBLAS handles to the stream, and allocates the minimal
// scratch buffer (info word only).
void init_context() {
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));

  // cuSOLVER handle, bound to our stream.
  EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
  EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));

  // cuBLAS handle, bound to the same stream.
  EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
  EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));

  // cuBLASLt handle; no stream is attached here.
  EIGEN_CUBLASLT_CHECK(cublasLtCreate(&cublas_lt_));

  // Zero workspace bytes still reserves room for the info word.
  ensure_scratch(0);
}
|
||||||
|
|
||||||
|
void ensure_scratch(size_t workspace_bytes) {
|
||||||
|
constexpr size_t kAlign = 16;
|
||||||
|
workspace_bytes = (workspace_bytes + kAlign - 1) & ~(kAlign - 1);
|
||||||
|
size_t needed = workspace_bytes + sizeof(int);
|
||||||
|
if (needed > scratch_size_) {
|
||||||
|
if (d_scratch_.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
d_scratch_ = internal::DeviceBuffer(needed);
|
||||||
|
scratch_size_ = needed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void* scratch_workspace() const { return d_scratch_.ptr; }
|
||||||
|
int* scratch_info() const {
|
||||||
|
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.ptr) + scratch_size_ - sizeof(int));
|
||||||
|
}
|
||||||
|
|
||||||
|
void sync_info() const {
|
||||||
|
if (!info_synced_) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
const_cast<GpuSVD*>(this)->info_ = (info_word_ == 0) ? Success : NumericalIssue;
|
||||||
|
const_cast<GpuSVD*>(this)->info_synced_ = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Swap U↔V flags for the transposed case.
|
||||||
|
static unsigned int swap_uv_options(unsigned int opts) {
|
||||||
|
unsigned int result = 0;
|
||||||
|
if (opts & ComputeThinU) result |= ComputeThinV;
|
||||||
|
if (opts & ComputeFullU) result |= ComputeFullV;
|
||||||
|
if (opts & ComputeThinV) result |= ComputeThinU;
|
||||||
|
if (opts & ComputeFullV) result |= ComputeFullU;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static signed char jobu(unsigned int opts) {
|
||||||
|
if (opts & ComputeFullU) return 'A';
|
||||||
|
if (opts & ComputeThinU) return 'S';
|
||||||
|
return 'N';
|
||||||
|
}
|
||||||
|
|
||||||
|
static signed char jobvt(unsigned int opts) {
|
||||||
|
if (opts & ComputeFullV) return 'A';
|
||||||
|
if (opts & ComputeThinV) return 'S';
|
||||||
|
return 'N';
|
||||||
|
}
|
||||||
|
|
||||||
|
void factorize() {
|
||||||
|
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
|
||||||
|
constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
|
||||||
|
const Index k = (std::min)(m_, n_);
|
||||||
|
|
||||||
|
info_synced_ = false;
|
||||||
|
info_ = InvalidInput;
|
||||||
|
|
||||||
|
// Allocate output buffers. When transposed, swap U/V roles for cuSOLVER.
|
||||||
|
d_S_ = internal::DeviceBuffer(static_cast<size_t>(k) * sizeof(RealScalar));
|
||||||
|
|
||||||
|
// Internal options: for transposed case, what user wants as U we compute as VT of A^H.
|
||||||
|
const unsigned int int_opts = transposed_ ? swap_uv_options(options_) : options_;
|
||||||
|
|
||||||
|
const Index ucols = (int_opts & ComputeFullU) ? m_ : ((int_opts & ComputeThinU) ? k : 0);
|
||||||
|
const Index vtrows = (int_opts & ComputeFullV) ? n_ : ((int_opts & ComputeThinV) ? k : 0);
|
||||||
|
const int64_t ldu = m_;
|
||||||
|
const int64_t ldvt = vtrows > 0 ? vtrows : 1;
|
||||||
|
|
||||||
|
if (ucols > 0) d_U_ = internal::DeviceBuffer(static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar));
|
||||||
|
if (vtrows > 0)
|
||||||
|
d_VT_ = internal::DeviceBuffer(static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar));
|
||||||
|
|
||||||
|
// computeType must match the matrix data type (dtype), not the singular value type (rtype).
|
||||||
|
eigen_assert(m_ >= n_ && "Internal error: m_ < n_ should have been handled by transpose in compute()");
|
||||||
|
size_t dev_ws = 0, host_ws = 0;
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd_bufferSize(
|
||||||
|
handle_, params_.p, jobu(int_opts), jobvt(int_opts), static_cast<int64_t>(m_), static_cast<int64_t>(n_), dtype,
|
||||||
|
d_A_.ptr, lda_, rtype, d_S_.ptr, dtype, ucols > 0 ? d_U_.ptr : nullptr, ldu, dtype,
|
||||||
|
vtrows > 0 ? d_VT_.ptr : nullptr, ldvt, dtype, &dev_ws, &host_ws));
|
||||||
|
|
||||||
|
ensure_scratch(dev_ws);
|
||||||
|
h_workspace_.resize(host_ws);
|
||||||
|
|
||||||
|
// Compute SVD.
|
||||||
|
EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd(handle_, params_.p, jobu(int_opts), jobvt(int_opts), static_cast<int64_t>(m_),
|
||||||
|
static_cast<int64_t>(n_), dtype, d_A_.ptr, lda_, rtype, d_S_.ptr, dtype,
|
||||||
|
ucols > 0 ? d_U_.ptr : nullptr, ldu, dtype, vtrows > 0 ? d_VT_.ptr : nullptr,
|
||||||
|
ldvt, dtype, scratch_workspace(), dev_ws,
|
||||||
|
host_ws > 0 ? h_workspace_.data() : nullptr, host_ws, scratch_info()));
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(&info_word_, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Internal solve: X = V * diag(D) * U^H * B, using top `trunc` triplets.
|
||||||
|
// D_ii = 1/S_i (if lambda==0) or S_i/(S_i^2+lambda^2).
|
||||||
|
//
|
||||||
|
// For non-transposed: stored U, VT. X = VT^H * D * U^H * B.
|
||||||
|
// For transposed (SVD of A^H): stored U', VT'. X = U' * D * VT' * B.
|
||||||
|
template <typename Rhs>
|
||||||
|
PlainMatrix solve_impl(const MatrixBase<Rhs>& B, Index trunc, RealScalar lambda) const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success && "GpuSVD::solve called on a failed or uninitialized decomposition");
|
||||||
|
eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "solve requires U");
|
||||||
|
eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "solve requires V");
|
||||||
|
|
||||||
|
const Index m_orig = transposed_ ? n_ : m_;
|
||||||
|
const Index n_orig = transposed_ ? m_ : n_;
|
||||||
|
eigen_assert(B.rows() == m_orig);
|
||||||
|
|
||||||
|
const Index k = (std::min)(m_, n_); // = min(m_orig, n_orig)
|
||||||
|
const Index kk = (std::min)(trunc, k);
|
||||||
|
const Index nrhs = B.cols();
|
||||||
|
|
||||||
|
// Download S to host to build the diagonal scaling.
|
||||||
|
RealVector S(k);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpy(S.data(), d_S_.ptr, static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
|
||||||
|
// Upload B (m_orig × nrhs).
|
||||||
|
const PlainMatrix rhs(B);
|
||||||
|
internal::DeviceBuffer d_B(static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.ptr, rhs.data(),
|
||||||
|
static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||||
|
cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
// Step 1: tmp = U_orig^H * B (kk × nrhs).
|
||||||
|
// Non-transposed: U_stored is m_×ucols, U_orig = U_stored. Use U_stored^H * B.
|
||||||
|
// Transposed: U_orig = VT_stored^H, so U_orig^H = VT_stored. Use VT_stored * B (no transpose!).
|
||||||
|
internal::DeviceBuffer d_tmp(static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar));
|
||||||
|
{
|
||||||
|
Scalar alpha_one(1), beta_zero(0);
|
||||||
|
|
||||||
|
if (!transposed_) {
|
||||||
|
// U_stored^H * B: (m_×kk)^H × (m_×nrhs) → kk×nrhs.
|
||||||
|
internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_C, CUBLAS_OP_N, kk, nrhs, m_, &alpha_one,
|
||||||
|
static_cast<const Scalar*>(d_U_.ptr), m_, static_cast<const Scalar*>(d_B.ptr),
|
||||||
|
m_orig, &beta_zero, static_cast<Scalar*>(d_tmp.ptr), kk, &gemm_workspace_,
|
||||||
|
stream_);
|
||||||
|
} else {
|
||||||
|
// VT_stored * B: VT_stored is vtrows×n_ = kk×m_orig (thin), NoTrans.
|
||||||
|
// vtrows×m_orig times m_orig×nrhs → vtrows×nrhs. Use first kk rows.
|
||||||
|
const Index vtrows_stored = (swap_uv_options(options_) & ComputeFullV) ? n_ : k;
|
||||||
|
internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_N, CUBLAS_OP_N, kk, nrhs, m_orig, &alpha_one,
|
||||||
|
static_cast<const Scalar*>(d_VT_.ptr), vtrows_stored,
|
||||||
|
static_cast<const Scalar*>(d_B.ptr), m_orig, &beta_zero,
|
||||||
|
static_cast<Scalar*>(d_tmp.ptr), kk, &gemm_workspace_, stream_);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: Scale row i of tmp by D_ii.
|
||||||
|
// Download tmp to host, scale, re-upload. (Simple and correct; a device kernel would be faster.)
|
||||||
|
{
|
||||||
|
PlainMatrix tmp(kk, nrhs);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(tmp.data(), d_tmp.ptr,
|
||||||
|
static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
|
||||||
|
for (Index i = 0; i < kk; ++i) {
|
||||||
|
RealScalar si = S(i);
|
||||||
|
RealScalar di = (lambda == RealScalar(0)) ? (si > 0 ? RealScalar(1) / si : RealScalar(0))
|
||||||
|
: si / (si * si + lambda * lambda);
|
||||||
|
tmp.row(i) *= Scalar(di);
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_tmp.ptr, tmp.data(),
|
||||||
|
static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||||
|
cudaMemcpyHostToDevice, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 3: X = V_orig * tmp (n_orig × nrhs).
|
||||||
|
// Non-transposed: V_orig = VT_stored^H. VT_stored[:kk,:]^H * tmp → n_orig × nrhs.
|
||||||
|
// Transposed: V_orig = U_stored[:,:kk]. U_stored * tmp → n_orig × nrhs (NoTrans).
|
||||||
|
PlainMatrix X(n_orig, nrhs);
|
||||||
|
{
|
||||||
|
internal::DeviceBuffer d_X(static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
|
||||||
|
Scalar alpha_one(1), beta_zero(0);
|
||||||
|
|
||||||
|
if (!transposed_) {
|
||||||
|
const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
|
||||||
|
internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_C, CUBLAS_OP_N, n_orig, nrhs, kk, &alpha_one,
|
||||||
|
static_cast<const Scalar*>(d_VT_.ptr), vtrows,
|
||||||
|
static_cast<const Scalar*>(d_tmp.ptr), kk, &beta_zero,
|
||||||
|
static_cast<Scalar*>(d_X.ptr), n_orig, &gemm_workspace_, stream_);
|
||||||
|
} else {
|
||||||
|
// U_stored is m_×ucols. V_orig = U_stored[:,:kk]. NoTrans × tmp.
|
||||||
|
internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_N, CUBLAS_OP_N, n_orig, nrhs, kk, &alpha_one,
|
||||||
|
static_cast<const Scalar*>(d_U_.ptr), m_, static_cast<const Scalar*>(d_tmp.ptr),
|
||||||
|
kk, &beta_zero, static_cast<Scalar*>(d_X.ptr), n_orig, &gemm_workspace_,
|
||||||
|
stream_);
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_X.ptr,
|
||||||
|
static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
|
||||||
|
cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
return X;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_SVD_H
|
||||||
481
Eigen/src/GPU/GpuSparseContext.h
Normal file
481
Eigen/src/GPU/GpuSparseContext.h
Normal file
@@ -0,0 +1,481 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix
|
||||||
|
// multiply (SpMM) via cuSPARSE.
|
||||||
|
//
|
||||||
|
// GpuSparseContext manages cuSPARSE descriptors and device buffers. It accepts
|
||||||
|
// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the GPU.
|
||||||
|
// RowMajor input is implicitly converted to ColMajor.
|
||||||
|
//
|
||||||
|
// Can borrow a GpuContext for same-stream execution with BLAS-1 ops (zero
|
||||||
|
// event overhead in iterative solvers like CG).
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// // Standalone (own stream):
|
||||||
|
// GpuSparseContext<double> ctx;
|
||||||
|
// VectorXd y = ctx.multiply(A, x);
|
||||||
|
//
|
||||||
|
// // Shared context (same stream as BLAS-1 ops):
|
||||||
|
// GpuContext gpu_ctx;
|
||||||
|
// GpuSparseContext<double> sparse_ctx(gpu_ctx);
|
||||||
|
// VectorXd y = sparse_ctx.multiply(A, x);
|
||||||
|
//
|
||||||
|
// // Device-resident (no host roundtrip):
|
||||||
|
// sparse_ctx.multiply(A, d_x, d_y); // DeviceMatrix in/out
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_SPARSE_CONTEXT_H
|
||||||
|
#define EIGEN_GPU_SPARSE_CONTEXT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuSparseSupport.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
// Forward declarations.
|
||||||
|
template <typename Scalar_>
|
||||||
|
class GpuSparseContext;
|
||||||
|
template <typename Scalar_>
|
||||||
|
class DeviceSparseView;
|
||||||
|
|
||||||
|
/** SpMV expression: DeviceSparseView * DeviceMatrix → SpMVExpr.
|
||||||
|
* Evaluated by DeviceMatrix::operator=(SpMVExpr). */
|
||||||
|
template <typename Scalar_>
|
||||||
|
class SpMVExpr {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
SpMVExpr(const DeviceSparseView<Scalar>& view, const DeviceMatrix<Scalar>& x) : view_(view), x_(x) {}
|
||||||
|
const DeviceSparseView<Scalar>& view() const { return view_; }
|
||||||
|
const DeviceMatrix<Scalar>& x() const { return x_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
const DeviceSparseView<Scalar>& view_;
|
||||||
|
const DeviceMatrix<Scalar>& x_;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Device-resident sparse matrix view. Returned by GpuSparseContext::deviceView().
|
||||||
|
* Lightweight handle referencing the context's cached device data.
|
||||||
|
*
|
||||||
|
* \warning One GpuSparseContext caches one sparse matrix at a time.
|
||||||
|
* Creating a second deviceView on the same context overwrites the first.
|
||||||
|
* For multiple simultaneous sparse matrices, use separate GpuSparseContext
|
||||||
|
* instances (they can share a GpuContext for same-stream execution).
|
||||||
|
*
|
||||||
|
* Supports `d_y = d_A * d_x` via SpMVExpr. */
|
||||||
|
template <typename Scalar_>
|
||||||
|
class DeviceSparseView {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
|
||||||
|
DeviceSparseView(GpuSparseContext<Scalar>& ctx, const SpMat& A) : ctx_(ctx), A_(A) {}
|
||||||
|
|
||||||
|
/** SpMV expression: d_A * d_x. Evaluated by DeviceMatrix::operator=. */
|
||||||
|
SpMVExpr<Scalar> operator*(const DeviceMatrix<Scalar>& x) const { return SpMVExpr<Scalar>(*this, x); }
|
||||||
|
|
||||||
|
Index rows() const { return A_.rows(); }
|
||||||
|
Index cols() const { return A_.cols(); }
|
||||||
|
const GpuSparseContext<Scalar>& context() const { return ctx_; }
|
||||||
|
const SpMat& matrix() const { return A_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
GpuSparseContext<Scalar>& ctx_;
|
||||||
|
const SpMat& A_;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
class GpuSparseContext {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
using StorageIndex = int;
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
|
||||||
|
using DenseVector = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||||
|
|
||||||
|
/** Standalone: creates own stream and cuSPARSE handle. */
|
||||||
|
GpuSparseContext() : owns_handle_(true) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||||
|
owns_stream_ = true;
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_));
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Borrow a GpuContext: shares stream and cuSPARSE handle.
|
||||||
|
* The GpuContext must outlive this GpuSparseContext. */
|
||||||
|
explicit GpuSparseContext(GpuContext& ctx)
|
||||||
|
: stream_(ctx.stream()), handle_(ctx.cusparseHandle()), owns_stream_(false), owns_handle_(false) {}
|
||||||
|
|
||||||
|
~GpuSparseContext() {
|
||||||
|
destroy_descriptors();
|
||||||
|
if (owns_handle_ && handle_) (void)cusparseDestroy(handle_);
|
||||||
|
if (owns_stream_ && stream_) (void)cudaStreamDestroy(stream_);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuSparseContext(const GpuSparseContext&) = delete;
|
||||||
|
GpuSparseContext& operator=(const GpuSparseContext&) = delete;
|
||||||
|
|
||||||
|
// ---- Device sparse view (for expression syntax: d_y = d_A * d_x) ----------
|
||||||
|
|
||||||
|
/** Upload a sparse matrix to device and return a lightweight view.
|
||||||
|
* The sparse data is uploaded immediately and cached in this context.
|
||||||
|
* The returned view can be used for repeated SpMV without re-uploading.
|
||||||
|
* If the matrix values change, call deviceView() again to re-upload.
|
||||||
|
*
|
||||||
|
* \warning One context caches one matrix. Calling deviceView() again
|
||||||
|
* overwrites the previous upload. For multiple simultaneous matrices,
|
||||||
|
* use separate GpuSparseContext instances sharing the same GpuContext.
|
||||||
|
*
|
||||||
|
* Supports `d_y = d_A * d_x` expression syntax. */
|
||||||
|
DeviceSparseView<Scalar> deviceView(const SpMat& A) {
|
||||||
|
eigen_assert(A.isCompressed());
|
||||||
|
upload_sparse(A);
|
||||||
|
return DeviceSparseView<Scalar>(*this, A);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMV: y = A * x (host vectors) --------------------------------------
|
||||||
|
|
||||||
|
/** Compute y = A * x. Returns y as a new dense vector. */
|
||||||
|
template <typename InputType, typename Rhs>
|
||||||
|
DenseVector multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
|
||||||
|
const SpMat mat(A.derived());
|
||||||
|
DenseVector y(mat.rows());
|
||||||
|
y.setZero();
|
||||||
|
multiply_host_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
|
||||||
|
return y;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Compute y = alpha * op(A) * x + beta * y (in-place, host vectors). */
|
||||||
|
template <typename InputType, typename Rhs, typename Dest>
|
||||||
|
void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y,
|
||||||
|
Scalar alpha = Scalar(1), Scalar beta = Scalar(0),
|
||||||
|
cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
|
||||||
|
const SpMat mat(A.derived());
|
||||||
|
multiply_host_impl(mat, x.derived(), y.derived(), alpha, beta, op);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMV: y = A * x (DeviceMatrix, no host roundtrip) -------------------
|
||||||
|
|
||||||
|
/** Compute d_y = A * d_x. Device-resident, no host transfer.
|
||||||
|
* Sparse matrix A is uploaded to device (cached). Dense vectors stay on device. */
|
||||||
|
template <typename InputType>
|
||||||
|
void multiply(const SparseMatrixBase<InputType>& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y) {
|
||||||
|
const SpMat mat(A.derived());
|
||||||
|
multiply_device_impl(mat, d_x, d_y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Compute d_y = alpha * op(A) * d_x + beta * d_y (DeviceMatrix, in-place). */
|
||||||
|
template <typename InputType>
|
||||||
|
void multiply(const SparseMatrixBase<InputType>& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y,
|
||||||
|
Scalar alpha, Scalar beta, cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
|
||||||
|
const SpMat mat(A.derived());
|
||||||
|
multiply_device_impl(mat, d_x, d_y, alpha, beta, op);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMV transpose -------------------------------------------------------
|
||||||
|
|
||||||
|
/** Compute y = A^T * x (host vectors). */
|
||||||
|
template <typename InputType, typename Rhs>
|
||||||
|
DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
|
||||||
|
const SpMat mat(A.derived());
|
||||||
|
DenseVector y(mat.cols());
|
||||||
|
y.setZero();
|
||||||
|
multiply_host_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE);
|
||||||
|
return y;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMM: Y = A * X (host, multiple RHS) --------------------------------
|
||||||
|
|
||||||
|
/** Compute Y = A * X where X is a dense matrix. Returns Y. */
|
||||||
|
template <typename InputType, typename Rhs>
|
||||||
|
DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X) {
|
||||||
|
const SpMat mat(A.derived());
|
||||||
|
const DenseMatrix rhs(X.derived());
|
||||||
|
eigen_assert(mat.cols() == rhs.rows());
|
||||||
|
|
||||||
|
const Index m = mat.rows();
|
||||||
|
const Index n = rhs.cols();
|
||||||
|
if (m == 0 || n == 0 || mat.nonZeros() == 0) return DenseMatrix::Zero(m, n);
|
||||||
|
|
||||||
|
DenseMatrix Y = DenseMatrix::Zero(m, n);
|
||||||
|
spmm_impl(mat, rhs, Y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
|
||||||
|
return Y;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Accessors ------------------------------------------------------------
|
||||||
|
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cusparseHandle_t handle_ = nullptr;
|
||||||
|
bool owns_stream_ = false;
|
||||||
|
bool owns_handle_ = false;
|
||||||
|
|
||||||
|
// Cached device buffers for sparse matrix (grow-only).
|
||||||
|
internal::DeviceBuffer d_outerPtr_;
|
||||||
|
internal::DeviceBuffer d_innerIdx_;
|
||||||
|
internal::DeviceBuffer d_values_;
|
||||||
|
size_t d_outerPtr_size_ = 0;
|
||||||
|
size_t d_innerIdx_size_ = 0;
|
||||||
|
size_t d_values_size_ = 0;
|
||||||
|
|
||||||
|
// Cached device buffers for host-API dense vectors (grow-only).
|
||||||
|
internal::DeviceBuffer d_x_;
|
||||||
|
internal::DeviceBuffer d_y_;
|
||||||
|
size_t d_x_size_ = 0;
|
||||||
|
size_t d_y_size_ = 0;
|
||||||
|
|
||||||
|
mutable internal::DeviceBuffer d_workspace_;
|
||||||
|
mutable size_t d_workspace_size_ = 0;
|
||||||
|
|
||||||
|
// Cached cuSPARSE sparse matrix descriptor.
|
||||||
|
cusparseSpMatDescr_t spmat_desc_ = nullptr;
|
||||||
|
Index cached_rows_ = -1;
|
||||||
|
Index cached_cols_ = -1;
|
||||||
|
Index cached_nnz_ = -1;
|
||||||
|
|
||||||
|
// ---- SpMV with host vectors (upload/download per call) --------------------
|
||||||
|
|
||||||
|
template <typename RhsDerived, typename DestDerived>
|
||||||
|
void multiply_host_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta,
|
||||||
|
cusparseOperation_t op) {
|
||||||
|
eigen_assert(A.isCompressed());
|
||||||
|
|
||||||
|
const Index m = A.rows();
|
||||||
|
const Index n = A.cols();
|
||||||
|
const Index nnz = A.nonZeros();
|
||||||
|
const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
|
||||||
|
const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
|
||||||
|
|
||||||
|
eigen_assert(x.size() == x_size);
|
||||||
|
eigen_assert(y.size() == y_size);
|
||||||
|
|
||||||
|
if (m == 0 || n == 0 || nnz == 0) {
|
||||||
|
if (beta == Scalar(0))
|
||||||
|
y.setZero();
|
||||||
|
else
|
||||||
|
y *= beta;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
upload_sparse(A);
|
||||||
|
|
||||||
|
ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar));
|
||||||
|
const DenseVector x_tmp(x);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_x_.ptr, x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar));
|
||||||
|
if (beta != Scalar(0)) {
|
||||||
|
const DenseVector y_tmp(y);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_y_.ptr, y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
exec_spmv(x_size, y_size, d_x_.ptr, d_y_.ptr, alpha, beta, op);
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(y.data(), d_y_.ptr, y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMV with DeviceMatrix (no host transfer) ----------------------------
|
||||||
|
|
||||||
|
// Called by public multiply(A, d_x, d_y) — always re-uploads A.
|
||||||
|
void multiply_device_impl(const SpMat& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y, Scalar alpha,
|
||||||
|
Scalar beta, cusparseOperation_t op) {
|
||||||
|
upload_sparse(A);
|
||||||
|
spmv_device_exec(d_x, d_y, alpha, beta, op);
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
/** Execute SpMV using the already-uploaded sparse matrix (no re-upload).
|
||||||
|
* Used by SpMVExpr (d_y = d_A * d_x) for cached deviceView() paths.
|
||||||
|
* The sparse matrix must have been uploaded via deviceView() or multiply(). */
|
||||||
|
void spmv_device_exec(const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y, Scalar alpha = Scalar(1),
|
||||||
|
Scalar beta = Scalar(0), cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) const {
|
||||||
|
eigen_assert(spmat_desc_ && "sparse matrix not uploaded — call deviceView() or multiply() first");
|
||||||
|
// cuSPARSE SpMV: y must not alias x (undefined behavior).
|
||||||
|
eigen_assert(d_x.data() != d_y.data() && "SpMV: output aliases input vector");
|
||||||
|
|
||||||
|
const Index m = cached_rows_;
|
||||||
|
const Index n = cached_cols_;
|
||||||
|
const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
|
||||||
|
const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
|
||||||
|
|
||||||
|
eigen_assert(d_x.rows() * d_x.cols() == x_size);
|
||||||
|
|
||||||
|
if (m == 0 || n == 0 || cached_nnz_ == 0) {
|
||||||
|
d_y.resize(y_size, 1);
|
||||||
|
if (beta == Scalar(0)) {
|
||||||
|
d_y.setZero();
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure d_y is allocated.
|
||||||
|
if (d_y.rows() * d_y.cols() != y_size) {
|
||||||
|
d_y.resize(y_size, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for input data to be ready on this stream.
|
||||||
|
d_x.waitReady(stream_);
|
||||||
|
d_y.waitReady(stream_);
|
||||||
|
|
||||||
|
exec_spmv(x_size, y_size, const_cast<void*>(static_cast<const void*>(d_x.data())), static_cast<void*>(d_y.data()),
|
||||||
|
alpha, beta, op);
|
||||||
|
|
||||||
|
d_y.recordReady(stream_);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
// ---- Shared SpMV execution ------------------------------------------------
|
||||||
|
|
||||||
|
void exec_spmv(Index x_size, Index y_size, void* d_x_ptr, void* d_y_ptr, Scalar alpha, Scalar beta,
|
||||||
|
cusparseOperation_t op) const {
|
||||||
|
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
|
||||||
|
cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr;
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_ptr, dtype));
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_ptr, dtype));
|
||||||
|
|
||||||
|
size_t ws_size = 0;
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
|
||||||
|
CUSPARSE_SPMV_ALG_DEFAULT, &ws_size));
|
||||||
|
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
|
||||||
|
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
|
||||||
|
CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.ptr));
|
||||||
|
|
||||||
|
(void)cusparseDestroyDnVec(x_desc);
|
||||||
|
(void)cusparseDestroyDnVec(y_desc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMM implementation --------------------------------------------------
|
||||||
|
|
||||||
|
void spmm_impl(const SpMat& A, const DenseMatrix& X, DenseMatrix& Y, Scalar alpha, Scalar beta,
|
||||||
|
cusparseOperation_t op) {
|
||||||
|
eigen_assert(A.isCompressed());
|
||||||
|
|
||||||
|
const Index m = A.rows();
|
||||||
|
const Index n = X.cols();
|
||||||
|
const Index k = A.cols();
|
||||||
|
const Index nnz = A.nonZeros();
|
||||||
|
|
||||||
|
if (m == 0 || n == 0 || k == 0 || nnz == 0) {
|
||||||
|
if (beta == Scalar(0))
|
||||||
|
Y.setZero();
|
||||||
|
else
|
||||||
|
Y *= beta;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
upload_sparse(A);
|
||||||
|
|
||||||
|
const size_t x_bytes = static_cast<size_t>(k) * static_cast<size_t>(n) * sizeof(Scalar);
|
||||||
|
const size_t y_bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(Scalar);
|
||||||
|
ensure_buffer(d_x_, d_x_size_, x_bytes);
|
||||||
|
ensure_buffer(d_y_, d_y_size_, y_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_x_.ptr, X.data(), x_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
if (beta != Scalar(0)) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.ptr, Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
|
||||||
|
cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr;
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k, n, k, d_x_.ptr, dtype, CUSPARSE_ORDER_COL));
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m, n, m, d_y_.ptr, dtype, CUSPARSE_ORDER_COL));
|
||||||
|
|
||||||
|
size_t ws_size = 0;
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_,
|
||||||
|
x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size));
|
||||||
|
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
|
||||||
|
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta,
|
||||||
|
y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.ptr));
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.ptr, y_bytes, cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
|
||||||
|
(void)cusparseDestroyDnMat(x_desc);
|
||||||
|
(void)cusparseDestroyDnMat(y_desc);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Helpers --------------------------------------------------------------
|
||||||
|
|
||||||
|
void upload_sparse(const SpMat& A) {
|
||||||
|
const Index m = A.rows();
|
||||||
|
const Index n = A.cols();
|
||||||
|
const Index nnz = A.nonZeros();
|
||||||
|
|
||||||
|
const size_t outer_bytes = static_cast<size_t>(n + 1) * sizeof(StorageIndex);
|
||||||
|
const size_t inner_bytes = static_cast<size_t>(nnz) * sizeof(StorageIndex);
|
||||||
|
const size_t val_bytes = static_cast<size_t>(nnz) * sizeof(Scalar);
|
||||||
|
|
||||||
|
ensure_buffer(d_outerPtr_, d_outerPtr_size_, outer_bytes);
|
||||||
|
ensure_buffer(d_innerIdx_, d_innerIdx_size_, inner_bytes);
|
||||||
|
ensure_buffer(d_values_, d_values_size_, val_bytes);
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_outerPtr_.ptr, A.outerIndexPtr(), outer_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(
|
||||||
|
cudaMemcpyAsync(d_innerIdx_.ptr, A.innerIndexPtr(), inner_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, A.valuePtr(), val_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) {
|
||||||
|
destroy_descriptors();
|
||||||
|
|
||||||
|
constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
|
||||||
|
constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value;
|
||||||
|
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr,
|
||||||
|
idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO, val_type));
|
||||||
|
cached_rows_ = m;
|
||||||
|
cached_cols_ = n;
|
||||||
|
cached_nnz_ = nnz;
|
||||||
|
} else {
|
||||||
|
EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void destroy_descriptors() {
|
||||||
|
if (spmat_desc_) {
|
||||||
|
(void)cusparseDestroySpMat(spmat_desc_);
|
||||||
|
spmat_desc_ = nullptr;
|
||||||
|
}
|
||||||
|
cached_rows_ = -1;
|
||||||
|
cached_cols_ = -1;
|
||||||
|
cached_nnz_ = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) const {
|
||||||
|
if (needed > current_size) {
|
||||||
|
if (buf.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
buf = internal::DeviceBuffer(needed);
|
||||||
|
current_size = needed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- DeviceMatrix::operator=(SpMVExpr) out-of-line definition ----------------
|
||||||
|
// Defined here because it needs the full GpuSparseContext definition.
|
||||||
|
|
||||||
|
template <typename Scalar_>
|
||||||
|
DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const SpMVExpr<Scalar_>& expr) {
|
||||||
|
// Use spmv_device_exec — the sparse matrix was already uploaded by deviceView().
|
||||||
|
// No re-upload on repeated SpMV with the same view.
|
||||||
|
expr.view().context().spmv_device_exec(expr.x(), *this, Scalar_(1), Scalar_(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_SPARSE_CONTEXT_H
|
||||||
62
Eigen/src/GPU/GpuSparseLDLT.h
Normal file
62
Eigen/src/GPU/GpuSparseLDLT.h
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU sparse LDL^T / LDL^H factorization via cuDSS.
|
||||||
|
//
|
||||||
|
// For symmetric indefinite (or Hermitian indefinite) sparse matrices.
|
||||||
|
// Same three-phase workflow as GpuSparseLLT.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuSparseLDLT<double> ldlt(A); // analyze + factorize
|
||||||
|
// VectorXd x = ldlt.solve(b); // solve
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_SPARSE_LDLT_H
|
||||||
|
#define EIGEN_GPU_SPARSE_LDLT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSparseSolverBase.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
/** GPU sparse LDL^T factorization (symmetric indefinite / Hermitian indefinite).
|
||||||
|
*
|
||||||
|
* Wraps cuDSS with CUDSS_MTYPE_SYMMETRIC (real) or CUDSS_MTYPE_HERMITIAN (complex).
|
||||||
|
* Uses pivoting for numerical stability.
|
||||||
|
*
|
||||||
|
* \tparam Scalar_ float, double, complex<float>, or complex<double>
|
||||||
|
* \tparam UpLo_ Lower (default) or Upper — which triangle of A is stored
|
||||||
|
*/
|
||||||
|
template <typename Scalar_, int UpLo_ = Lower>
|
||||||
|
class GpuSparseLDLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT<Scalar_, UpLo_>> {
|
||||||
|
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLDLT>;
|
||||||
|
friend Base;
|
||||||
|
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
enum { UpLo = UpLo_ };
|
||||||
|
|
||||||
|
GpuSparseLDLT() = default;
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
explicit GpuSparseLDLT(const SparseMatrixBase<InputType>& A) {
|
||||||
|
this->compute(A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr bool needs_csr_conversion() { return false; }
|
||||||
|
static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_symmetric_type<Scalar>::value; }
|
||||||
|
static constexpr cudssMatrixViewType_t cudss_matrix_view() {
|
||||||
|
return internal::cudss_view_type<UpLo, ColMajor>::value;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_SPARSE_LDLT_H
|
||||||
62
Eigen/src/GPU/GpuSparseLLT.h
Normal file
62
Eigen/src/GPU/GpuSparseLLT.h
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU sparse Cholesky (LL^T / LL^H) via cuDSS.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuSparseLLT<double> llt(A); // analyze + factorize
|
||||||
|
// VectorXd x = llt.solve(b); // solve
|
||||||
|
// llt.analyzePattern(A); // or separate phases
|
||||||
|
// llt.factorize(A_new); // reuse symbolic analysis
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_SPARSE_LLT_H
|
||||||
|
#define EIGEN_GPU_SPARSE_LLT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSparseSolverBase.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
/** GPU sparse Cholesky factorization (LL^T for real, LL^H for complex).
|
||||||
|
*
|
||||||
|
* Wraps cuDSS with CUDSS_MTYPE_SPD (real) or CUDSS_MTYPE_HPD (complex).
|
||||||
|
* Accepts ColMajor SparseMatrix (CSC), reinterpreted as CSR with swapped
|
||||||
|
* triangle view for zero-copy upload.
|
||||||
|
*
|
||||||
|
* \tparam Scalar_ float, double, complex<float>, or complex<double>
|
||||||
|
* \tparam UpLo_ Lower (default) or Upper — which triangle of A is stored
|
||||||
|
*/
|
||||||
|
template <typename Scalar_, int UpLo_ = Lower>
|
||||||
|
class GpuSparseLLT : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT<Scalar_, UpLo_>> {
|
||||||
|
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLLT>;
|
||||||
|
friend Base;
|
||||||
|
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
enum { UpLo = UpLo_ };
|
||||||
|
|
||||||
|
GpuSparseLLT() = default;
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
explicit GpuSparseLLT(const SparseMatrixBase<InputType>& A) {
|
||||||
|
this->compute(A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr bool needs_csr_conversion() { return false; }
|
||||||
|
static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_spd_type<Scalar>::value; }
|
||||||
|
static constexpr cudssMatrixViewType_t cudss_matrix_view() {
|
||||||
|
return internal::cudss_view_type<UpLo, ColMajor>::value;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_SPARSE_LLT_H
|
||||||
59
Eigen/src/GPU/GpuSparseLU.h
Normal file
59
Eigen/src/GPU/GpuSparseLU.h
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// GPU sparse LU factorization via cuDSS.
|
||||||
|
//
|
||||||
|
// For general (non-symmetric) sparse matrices. Uses pivoting.
|
||||||
|
// Same three-phase workflow as GpuSparseLLT.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuSparseLU<double> lu(A); // analyze + factorize
|
||||||
|
// VectorXd x = lu.solve(b); // solve
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_SPARSE_LU_H
|
||||||
|
#define EIGEN_GPU_SPARSE_LU_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./GpuSparseSolverBase.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
|
||||||
|
/** GPU sparse LU factorization (general matrices).
|
||||||
|
*
|
||||||
|
* Wraps cuDSS with CUDSS_MTYPE_GENERAL and CUDSS_MVIEW_FULL.
|
||||||
|
* Accepts ColMajor SparseMatrix (CSC); internally converts to RowMajor
|
||||||
|
* CSR since cuDSS requires CSR input.
|
||||||
|
*
|
||||||
|
* \tparam Scalar_ float, double, complex<float>, or complex<double>
|
||||||
|
*/
|
||||||
|
template <typename Scalar_>
|
||||||
|
class GpuSparseLU : public internal::GpuSparseSolverBase<Scalar_, GpuSparseLU<Scalar_>> {
|
||||||
|
using Base = internal::GpuSparseSolverBase<Scalar_, GpuSparseLU>;
|
||||||
|
friend Base;
|
||||||
|
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
|
||||||
|
GpuSparseLU() = default;
|
||||||
|
|
||||||
|
template <typename InputType>
|
||||||
|
explicit GpuSparseLU(const SparseMatrixBase<InputType>& A) {
|
||||||
|
this->compute(A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr bool needs_csr_conversion() { return true; }
|
||||||
|
static constexpr cudssMatrixType_t cudss_matrix_type() { return CUDSS_MTYPE_GENERAL; }
|
||||||
|
static constexpr cudssMatrixViewType_t cudss_matrix_view() { return CUDSS_MVIEW_FULL; }
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_SPARSE_LU_H
|
||||||
356
Eigen/src/GPU/GpuSparseSolverBase.h
Normal file
356
Eigen/src/GPU/GpuSparseSolverBase.h
Normal file
@@ -0,0 +1,356 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Common base for GPU sparse direct solvers (LLT, LDLT, LU) via cuDSS.
|
||||||
|
//
|
||||||
|
// All three solver types share the same three-phase workflow
|
||||||
|
// (analyzePattern → factorize → solve) and differ only in the
|
||||||
|
// cudssMatrixType_t and cudssMatrixViewType_t passed to cuDSS.
|
||||||
|
// This CRTP base implements the entire workflow; derived classes
|
||||||
|
// provide the matrix type/view via static constexpr members.
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_SPARSE_SOLVER_BASE_H
|
||||||
|
#define EIGEN_GPU_SPARSE_SOLVER_BASE_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include "./CuDssSupport.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
/** CRTP base for GPU sparse direct solvers.
|
||||||
|
*
|
||||||
|
* \tparam Scalar_ Element type (passed explicitly to avoid incomplete-type issues with CRTP).
|
||||||
|
* \tparam Derived The concrete solver class (GpuSparseLLT, GpuSparseLDLT, GpuSparseLU).
|
||||||
|
* Must provide:
|
||||||
|
* - `static constexpr cudssMatrixType_t cudss_matrix_type()`
|
||||||
|
* - `static constexpr cudssMatrixViewType_t cudss_matrix_view()`
|
||||||
|
*/
|
||||||
|
template <typename Scalar_, typename Derived>
|
||||||
|
class GpuSparseSolverBase {
|
||||||
|
public:
|
||||||
|
using Scalar = Scalar_;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
using StorageIndex = int;
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
|
||||||
|
using CsrMat = SparseMatrix<Scalar, RowMajor, StorageIndex>;
|
||||||
|
using DenseVector = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
|
||||||
|
|
||||||
|
GpuSparseSolverBase() { init_context(); }
|
||||||
|
|
||||||
|
~GpuSparseSolverBase() {
|
||||||
|
destroy_cudss_objects();
|
||||||
|
if (handle_) (void)cudssDestroy(handle_);
|
||||||
|
if (stream_) (void)cudaStreamDestroy(stream_);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuSparseSolverBase(const GpuSparseSolverBase&) = delete;
|
||||||
|
GpuSparseSolverBase& operator=(const GpuSparseSolverBase&) = delete;
|
||||||
|
|
||||||
|
// ---- Configuration --------------------------------------------------------
|
||||||
|
|
||||||
|
/** Set the fill-reducing ordering algorithm. Must be called before compute/analyzePattern. */
|
||||||
|
void setOrdering(GpuSparseOrdering ordering) { ordering_ = ordering; }
|
||||||
|
|
||||||
|
// ---- Factorization --------------------------------------------------------
|
||||||
|
|
||||||
|
/** Symbolic analysis + numeric factorization. */
|
||||||
|
template <typename InputType>
|
||||||
|
Derived& compute(const SparseMatrixBase<InputType>& A) {
|
||||||
|
analyzePattern(A);
|
||||||
|
if (info_ == Success) {
|
||||||
|
factorize(A);
|
||||||
|
}
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Symbolic analysis only. Uploads sparsity structure to device.
|
||||||
|
* This phase is synchronous (blocks until complete). */
|
||||||
|
template <typename InputType>
|
||||||
|
Derived& analyzePattern(const SparseMatrixBase<InputType>& A) {
|
||||||
|
const SpMat csc(A.derived());
|
||||||
|
eigen_assert(csc.rows() == csc.cols() && "GpuSparseSolver requires a square matrix");
|
||||||
|
eigen_assert(csc.isCompressed() && "GpuSparseSolver requires a compressed sparse matrix");
|
||||||
|
|
||||||
|
n_ = csc.rows();
|
||||||
|
info_ = InvalidInput;
|
||||||
|
analysis_done_ = false;
|
||||||
|
|
||||||
|
if (n_ == 0) {
|
||||||
|
nnz_ = 0;
|
||||||
|
info_ = Success;
|
||||||
|
analysis_done_ = true;
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
// For symmetric solvers, ColMajor CSC can be reinterpreted as CSR with
|
||||||
|
// swapped triangle view (zero copy). For general solvers, we must convert
|
||||||
|
// to actual RowMajor CSR so cuDSS sees the correct matrix, not A^T.
|
||||||
|
if (Derived::needs_csr_conversion()) {
|
||||||
|
const CsrMat csr(csc);
|
||||||
|
nnz_ = csr.nonZeros();
|
||||||
|
upload_csr(csr);
|
||||||
|
} else {
|
||||||
|
nnz_ = csc.nonZeros();
|
||||||
|
upload_csr_from_csc(csc);
|
||||||
|
}
|
||||||
|
create_cudss_matrix();
|
||||||
|
apply_ordering_config();
|
||||||
|
|
||||||
|
if (data_) EIGEN_CUDSS_CHECK(cudssDataDestroy(handle_, data_));
|
||||||
|
EIGEN_CUDSS_CHECK(cudssDataCreate(handle_, &data_));
|
||||||
|
|
||||||
|
create_placeholder_dense();
|
||||||
|
|
||||||
|
EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_ANALYSIS, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
|
||||||
|
|
||||||
|
analysis_done_ = true;
|
||||||
|
info_ = Success;
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Numeric factorization using the symbolic analysis from analyzePattern.
|
||||||
|
*
|
||||||
|
* \warning The sparsity pattern (outerIndexPtr, innerIndexPtr) must be
|
||||||
|
* identical to the one passed to analyzePattern(). Only the numerical
|
||||||
|
* values may change. Passing a different pattern is undefined behavior.
|
||||||
|
* This matches the contract of CHOLMOD, UMFPACK, and cuDSS's own API.
|
||||||
|
*
|
||||||
|
* This phase is asynchronous — info() lazily synchronizes. */
|
||||||
|
template <typename InputType>
|
||||||
|
Derived& factorize(const SparseMatrixBase<InputType>& A) {
|
||||||
|
eigen_assert(analysis_done_ && "factorize() requires analyzePattern() first");
|
||||||
|
|
||||||
|
if (n_ == 0) {
|
||||||
|
info_ = Success;
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert to the same format used in analyzePattern.
|
||||||
|
// Both temporaries must outlive the async memcpy (pageable H2D is actually
|
||||||
|
// synchronous w.r.t. the host, but keep them alive for clarity).
|
||||||
|
const SpMat csc(A.derived());
|
||||||
|
eigen_assert(csc.rows() == n_ && csc.cols() == n_);
|
||||||
|
|
||||||
|
const Scalar* value_ptr;
|
||||||
|
Index value_nnz;
|
||||||
|
CsrMat csr_tmp;
|
||||||
|
if (Derived::needs_csr_conversion()) {
|
||||||
|
csr_tmp = CsrMat(csc);
|
||||||
|
value_ptr = csr_tmp.valuePtr();
|
||||||
|
value_nnz = csr_tmp.nonZeros();
|
||||||
|
} else {
|
||||||
|
value_ptr = csc.valuePtr();
|
||||||
|
value_nnz = csc.nonZeros();
|
||||||
|
}
|
||||||
|
eigen_assert(value_nnz == nnz_);
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, value_ptr, static_cast<size_t>(nnz_) * sizeof(Scalar),
|
||||||
|
cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
EIGEN_CUDSS_CHECK(cudssMatrixSetValues(d_A_cudss_, d_values_.ptr));
|
||||||
|
|
||||||
|
info_ = InvalidInput;
|
||||||
|
info_synced_ = false;
|
||||||
|
EIGEN_CUDSS_CHECK(
|
||||||
|
cudssExecute(handle_, CUDSS_PHASE_FACTORIZATION, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
|
||||||
|
|
||||||
|
return derived();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve ----------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Solve A * X = B. Returns X as a dense matrix.
|
||||||
|
* Supports single or multiple right-hand sides. */
|
||||||
|
template <typename Rhs>
|
||||||
|
DenseMatrix solve(const MatrixBase<Rhs>& B) const {
|
||||||
|
sync_info();
|
||||||
|
eigen_assert(info_ == Success && "GpuSparseSolver::solve requires a successful factorization");
|
||||||
|
eigen_assert(B.rows() == n_);
|
||||||
|
|
||||||
|
const DenseMatrix rhs(B);
|
||||||
|
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
|
||||||
|
|
||||||
|
if (n_ == 0) return DenseMatrix(0, rhs.cols());
|
||||||
|
|
||||||
|
const size_t rhs_bytes = static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar);
|
||||||
|
DeviceBuffer d_b(rhs_bytes);
|
||||||
|
DeviceBuffer d_x(rhs_bytes);
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_b.ptr, rhs.data(), rhs_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
|
||||||
|
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
||||||
|
cudssMatrix_t b_cudss = nullptr, x_cudss = nullptr;
|
||||||
|
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&b_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_b.ptr,
|
||||||
|
dtype, CUDSS_LAYOUT_COL_MAJOR));
|
||||||
|
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&x_cudss, static_cast<int64_t>(n_), nrhs, static_cast<int64_t>(n_), d_x.ptr,
|
||||||
|
dtype, CUDSS_LAYOUT_COL_MAJOR));
|
||||||
|
|
||||||
|
EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_SOLVE, config_, data_, d_A_cudss_, x_cudss, b_cudss));
|
||||||
|
|
||||||
|
DenseMatrix X(n_, rhs.cols());
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_x.ptr, rhs_bytes, cudaMemcpyDeviceToHost, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
|
||||||
|
(void)cudssMatrixDestroy(b_cudss);
|
||||||
|
(void)cudssMatrixDestroy(x_cudss);
|
||||||
|
|
||||||
|
return X;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Accessors ------------------------------------------------------------
|
||||||
|
|
||||||
|
ComputationInfo info() const {
|
||||||
|
sync_info();
|
||||||
|
return info_;
|
||||||
|
}
|
||||||
|
Index rows() const { return n_; }
|
||||||
|
Index cols() const { return n_; }
|
||||||
|
|
||||||
|
cudaStream_t stream() const { return stream_; }
|
||||||
|
|
||||||
|
protected:
|
||||||
|
// ---- CUDA / cuDSS handles -------------------------------------------------
|
||||||
|
cudaStream_t stream_ = nullptr;
|
||||||
|
cudssHandle_t handle_ = nullptr;
|
||||||
|
cudssConfig_t config_ = nullptr;
|
||||||
|
cudssData_t data_ = nullptr;
|
||||||
|
cudssMatrix_t d_A_cudss_ = nullptr;
|
||||||
|
cudssMatrix_t d_x_cudss_ = nullptr;
|
||||||
|
cudssMatrix_t d_b_cudss_ = nullptr;
|
||||||
|
|
||||||
|
// ---- Device buffers for CSR arrays ----------------------------------------
|
||||||
|
DeviceBuffer d_rowPtr_;
|
||||||
|
DeviceBuffer d_colIdx_;
|
||||||
|
DeviceBuffer d_values_;
|
||||||
|
|
||||||
|
// ---- State ----------------------------------------------------------------
|
||||||
|
Index n_ = 0;
|
||||||
|
Index nnz_ = 0;
|
||||||
|
ComputationInfo info_ = InvalidInput;
|
||||||
|
bool info_synced_ = true;
|
||||||
|
bool analysis_done_ = false;
|
||||||
|
GpuSparseOrdering ordering_ = GpuSparseOrdering::AMD;
|
||||||
|
|
||||||
|
private:
|
||||||
|
Derived& derived() { return static_cast<Derived&>(*this); }
|
||||||
|
const Derived& derived() const { return static_cast<const Derived&>(*this); }
|
||||||
|
|
||||||
|
void init_context() {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
|
||||||
|
EIGEN_CUDSS_CHECK(cudssCreate(&handle_));
|
||||||
|
EIGEN_CUDSS_CHECK(cudssSetStream(handle_, stream_));
|
||||||
|
EIGEN_CUDSS_CHECK(cudssConfigCreate(&config_));
|
||||||
|
}
|
||||||
|
|
||||||
|
void sync_info() const {
|
||||||
|
if (!info_synced_) {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
|
||||||
|
int cudss_info = 0;
|
||||||
|
EIGEN_CUDSS_CHECK(cudssDataGet(handle_, data_, CUDSS_DATA_INFO, &cudss_info, sizeof(cudss_info), nullptr));
|
||||||
|
auto* self = const_cast<GpuSparseSolverBase*>(this);
|
||||||
|
self->info_ = (cudss_info == 0) ? Success : NumericalIssue;
|
||||||
|
self->info_synced_ = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void destroy_cudss_objects() {
|
||||||
|
if (d_A_cudss_) {
|
||||||
|
(void)cudssMatrixDestroy(d_A_cudss_);
|
||||||
|
d_A_cudss_ = nullptr;
|
||||||
|
}
|
||||||
|
if (d_x_cudss_) {
|
||||||
|
(void)cudssMatrixDestroy(d_x_cudss_);
|
||||||
|
d_x_cudss_ = nullptr;
|
||||||
|
}
|
||||||
|
if (d_b_cudss_) {
|
||||||
|
(void)cudssMatrixDestroy(d_b_cudss_);
|
||||||
|
d_b_cudss_ = nullptr;
|
||||||
|
}
|
||||||
|
if (data_) {
|
||||||
|
(void)cudssDataDestroy(handle_, data_);
|
||||||
|
data_ = nullptr;
|
||||||
|
}
|
||||||
|
if (config_) {
|
||||||
|
(void)cudssConfigDestroy(config_);
|
||||||
|
config_ = nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Upload CSR from a RowMajor sparse matrix (native CSR).
|
||||||
|
void upload_csr(const CsrMat& csr) { upload_compressed(csr.outerIndexPtr(), csr.innerIndexPtr(), csr.valuePtr()); }
|
||||||
|
|
||||||
|
// Upload CSC arrays reinterpreted as CSR (for symmetric matrices: CSC(A) = CSR(A^T) = CSR(A)).
|
||||||
|
void upload_csr_from_csc(const SpMat& csc) {
|
||||||
|
upload_compressed(csc.outerIndexPtr(), csc.innerIndexPtr(), csc.valuePtr());
|
||||||
|
}
|
||||||
|
|
||||||
|
void upload_compressed(const StorageIndex* outer, const StorageIndex* inner, const Scalar* values) {
|
||||||
|
const size_t rowptr_bytes = static_cast<size_t>(n_ + 1) * sizeof(StorageIndex);
|
||||||
|
const size_t colidx_bytes = static_cast<size_t>(nnz_) * sizeof(StorageIndex);
|
||||||
|
const size_t values_bytes = static_cast<size_t>(nnz_) * sizeof(Scalar);
|
||||||
|
|
||||||
|
d_rowPtr_ = DeviceBuffer(rowptr_bytes);
|
||||||
|
d_colIdx_ = DeviceBuffer(colidx_bytes);
|
||||||
|
d_values_ = DeviceBuffer(values_bytes);
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_rowPtr_.ptr, outer, rowptr_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_colIdx_.ptr, inner, colidx_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, values, values_bytes, cudaMemcpyHostToDevice, stream_));
|
||||||
|
}
|
||||||
|
|
||||||
|
void create_cudss_matrix() {
|
||||||
|
if (d_A_cudss_) (void)cudssMatrixDestroy(d_A_cudss_);
|
||||||
|
|
||||||
|
constexpr cudaDataType_t idx_type = cudss_index_type<StorageIndex>::value;
|
||||||
|
constexpr cudaDataType_t val_type = cuda_data_type<Scalar>::value;
|
||||||
|
constexpr cudssMatrixType_t mtype = Derived::cudss_matrix_type();
|
||||||
|
constexpr cudssMatrixViewType_t mview = Derived::cudss_matrix_view();
|
||||||
|
|
||||||
|
EIGEN_CUDSS_CHECK(cudssMatrixCreateCsr(
|
||||||
|
&d_A_cudss_, static_cast<int64_t>(n_), static_cast<int64_t>(n_), static_cast<int64_t>(nnz_), d_rowPtr_.ptr,
|
||||||
|
/*rowEnd=*/nullptr, d_colIdx_.ptr, d_values_.ptr, idx_type, val_type, mtype, mview, CUDSS_BASE_ZERO));
|
||||||
|
}
|
||||||
|
|
||||||
|
void apply_ordering_config() {
|
||||||
|
cudssAlgType_t alg;
|
||||||
|
switch (ordering_) {
|
||||||
|
case GpuSparseOrdering::AMD:
|
||||||
|
alg = CUDSS_ALG_DEFAULT;
|
||||||
|
break;
|
||||||
|
case GpuSparseOrdering::METIS:
|
||||||
|
alg = CUDSS_ALG_2;
|
||||||
|
break;
|
||||||
|
case GpuSparseOrdering::RCM:
|
||||||
|
alg = CUDSS_ALG_3;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
alg = CUDSS_ALG_DEFAULT;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
EIGEN_CUDSS_CHECK(cudssConfigSet(config_, CUDSS_CONFIG_REORDERING_ALG, &alg, sizeof(alg)));
|
||||||
|
}
|
||||||
|
|
||||||
|
void create_placeholder_dense() {
|
||||||
|
if (d_x_cudss_) (void)cudssMatrixDestroy(d_x_cudss_);
|
||||||
|
if (d_b_cudss_) (void)cudssMatrixDestroy(d_b_cudss_);
|
||||||
|
constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
|
||||||
|
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_x_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
|
||||||
|
dtype, CUDSS_LAYOUT_COL_MAJOR));
|
||||||
|
EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_b_cudss_, static_cast<int64_t>(n_), 1, static_cast<int64_t>(n_), nullptr,
|
||||||
|
dtype, CUDSS_LAYOUT_COL_MAJOR));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_SPARSE_SOLVER_BASE_H
|
||||||
182
Eigen/src/GPU/GpuSupport.h
Normal file
182
Eigen/src/GPU/GpuSupport.h
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Generic CUDA runtime support shared across all GPU library integrations
|
||||||
|
// (cuSOLVER, cuBLAS, cuDSS, etc.):
|
||||||
|
// - Error-checking macros
|
||||||
|
// - RAII device buffer
|
||||||
|
//
|
||||||
|
// Only depends on <cuda_runtime.h>. No NVIDIA library headers.
|
||||||
|
|
||||||
|
#ifndef EIGEN_GPU_SUPPORT_H
|
||||||
|
#define EIGEN_GPU_SUPPORT_H
|
||||||
|
|
||||||
|
// IWYU pragma: private
|
||||||
|
#include "./InternalHeaderCheck.h"
|
||||||
|
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
namespace internal {
|
||||||
|
|
||||||
|
// ---- Error-checking macros --------------------------------------------------
// These abort (via eigen_assert) on failure. Not for use in destructors.
//
// EIGEN_CUDA_RUNTIME_CHECK(expr) evaluates `expr` exactly once and asserts that
// it returned cudaSuccess. If eigen_assert expands to nothing in the current
// build, the call is still made but its status is discarded; the explicit void
// cast keeps that configuration free of unused-variable warnings. The local
// uses a long, macro-specific name so that an `expr` mentioning one of the
// caller's own variables can never bind to it by accident.

#define EIGEN_CUDA_RUNTIME_CHECK(expr)                                                     \
  do {                                                                                     \
    const cudaError_t eigen_cuda_runtime_status_ = (expr);                                 \
    (void)eigen_cuda_runtime_status_;                                                      \
    eigen_assert(eigen_cuda_runtime_status_ == cudaSuccess && "CUDA runtime call failed"); \
  } while (0)
|
||||||
|
|
||||||
|
// ---- RAII: device buffer ----------------------------------------------------
|
||||||
|
|
||||||
|
// Thread-local pool of small device buffers to avoid cudaMalloc/cudaFree
|
||||||
|
// overhead for tiny allocations (e.g., DeviceScalar). Buffers up to
|
||||||
|
// kSmallBufferThreshold bytes are recycled; larger allocations bypass the pool.
|
||||||
|
template <size_t SmallBufferThreshold = 256, size_t MaxPoolSize = 64>
|
||||||
|
struct DeviceBufferPool {
|
||||||
|
static constexpr size_t kSmallBufferThreshold = SmallBufferThreshold;
|
||||||
|
static constexpr size_t kMaxPoolSize = MaxPoolSize;
|
||||||
|
|
||||||
|
struct Entry {
|
||||||
|
void* ptr;
|
||||||
|
size_t bytes;
|
||||||
|
};
|
||||||
|
|
||||||
|
~DeviceBufferPool() {
|
||||||
|
for (auto& e : free_list_) (void)cudaFree(e.ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
void* allocate(size_t bytes) {
|
||||||
|
// Search for a buffer of sufficient size.
|
||||||
|
for (size_t i = 0; i < free_list_.size(); ++i) {
|
||||||
|
if (free_list_[i].bytes >= bytes) {
|
||||||
|
void* p = free_list_[i].ptr;
|
||||||
|
free_list_[i] = free_list_.back();
|
||||||
|
free_list_.pop_back();
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// No suitable buffer found — allocate new.
|
||||||
|
void* p = nullptr;
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
void deallocate(void* p, size_t bytes) {
|
||||||
|
if (free_list_.size() < kMaxPoolSize) {
|
||||||
|
free_list_.push_back({p, bytes});
|
||||||
|
} else {
|
||||||
|
(void)cudaFree(p);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static DeviceBufferPool& threadLocal() {
|
||||||
|
thread_local DeviceBufferPool pool;
|
||||||
|
return pool;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::vector<Entry> free_list_;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct DeviceBuffer {
|
||||||
|
void* ptr = nullptr;
|
||||||
|
|
||||||
|
DeviceBuffer() = default;
|
||||||
|
|
||||||
|
explicit DeviceBuffer(size_t bytes) : size_(bytes) {
|
||||||
|
if (bytes > 0) {
|
||||||
|
if (bytes <= DeviceBufferPool<>::kSmallBufferThreshold) {
|
||||||
|
ptr = DeviceBufferPool<>::threadLocal().allocate(bytes);
|
||||||
|
} else {
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&ptr, bytes));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
~DeviceBuffer() {
|
||||||
|
if (ptr) {
|
||||||
|
if (size_ <= DeviceBufferPool<>::kSmallBufferThreshold) {
|
||||||
|
DeviceBufferPool<>::threadLocal().deallocate(ptr, size_);
|
||||||
|
} else {
|
||||||
|
(void)cudaFree(ptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Move-only.
|
||||||
|
DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr), size_(o.size_) {
|
||||||
|
o.ptr = nullptr;
|
||||||
|
o.size_ = 0;
|
||||||
|
}
|
||||||
|
DeviceBuffer& operator=(DeviceBuffer&& o) noexcept {
|
||||||
|
if (this != &o) {
|
||||||
|
if (ptr) {
|
||||||
|
if (size_ <= DeviceBufferPool<>::kSmallBufferThreshold) {
|
||||||
|
DeviceBufferPool<>::threadLocal().deallocate(ptr, size_);
|
||||||
|
} else {
|
||||||
|
(void)cudaFree(ptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ptr = o.ptr;
|
||||||
|
size_ = o.size_;
|
||||||
|
o.ptr = nullptr;
|
||||||
|
o.size_ = 0;
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
DeviceBuffer(const DeviceBuffer&) = delete;
|
||||||
|
DeviceBuffer& operator=(const DeviceBuffer&) = delete;
|
||||||
|
|
||||||
|
size_t size() const { return size_; }
|
||||||
|
|
||||||
|
// Adopt an existing device pointer. Caller relinquishes ownership.
|
||||||
|
// Adopted buffers bypass the pool on destruction.
|
||||||
|
static DeviceBuffer adopt(void* p) {
|
||||||
|
DeviceBuffer b;
|
||||||
|
b.ptr = p;
|
||||||
|
b.size_ = DeviceBufferPool<>::kSmallBufferThreshold + 1; // force cudaFree
|
||||||
|
return b;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
size_t size_ = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---- Scalar → cudaDataType_t ------------------------------------------------
|
||||||
|
// Shared by cuBLAS and cuSOLVER. cudaDataType_t is defined in library_types.h
|
||||||
|
// which is included transitively by cuda_runtime.h.
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
struct cuda_data_type;
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct cuda_data_type<float> {
|
||||||
|
static constexpr cudaDataType_t value = CUDA_R_32F;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cuda_data_type<double> {
|
||||||
|
static constexpr cudaDataType_t value = CUDA_R_64F;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cuda_data_type<std::complex<float>> {
|
||||||
|
static constexpr cudaDataType_t value = CUDA_C_32F;
|
||||||
|
};
|
||||||
|
template <>
|
||||||
|
struct cuda_data_type<std::complex<double>> {
|
||||||
|
static constexpr cudaDataType_t value = CUDA_C_64F;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace internal
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_GPU_SUPPORT_H
|
||||||
3
Eigen/src/GPU/InternalHeaderCheck.h
Normal file
3
Eigen/src/GPU/InternalHeaderCheck.h
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
#ifndef EIGEN_GPU_MODULE_H
|
||||||
|
#error "Please include Eigen/GPU instead of including headers inside the src/GPU directory directly."
|
||||||
|
#endif
|
||||||
837
Eigen/src/GPU/README.md
Normal file
837
Eigen/src/GPU/README.md
Normal file
@@ -0,0 +1,837 @@
|
|||||||
|
# Eigen GPU Module (`Eigen/GPU`)
|
||||||
|
|
||||||
|
GPU-accelerated linear algebra for Eigen users, dispatching to NVIDIA CUDA
|
||||||
|
libraries (cuBLAS, cuSOLVER, cuFFT, cuSPARSE, cuDSS). Requires CUDA 11.4+;
|
||||||
|
cuDSS features require CUDA 12.0+ and a separate cuDSS install. Header-only.
|
||||||
|
|
||||||
|
## Why this module
|
||||||
|
|
||||||
|
Eigen is the linear algebra foundation for a large ecosystem of C++ projects
|
||||||
|
in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP,
|
||||||
|
Open3D), scientific computing (Ceres, Stan), and beyond. Many of these
|
||||||
|
projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations
|
||||||
|
without dropping down to raw CUDA library APIs.
|
||||||
|
|
||||||
|
GPU sparse solvers are a particularly acute gap. Sparse factorization is the
|
||||||
|
bottleneck in SLAM, bundle adjustment, FEM, and nonlinear optimization --
|
||||||
|
exactly the workloads where GPU acceleration matters most. Downstream projects
|
||||||
|
like [Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
|
||||||
|
[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
|
||||||
|
GPU-accelerated sparse solvers, and third-party projects like
|
||||||
|
[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically because
|
||||||
|
Eigen lacks them. The `Eigen/GPU` module provides GPU sparse Cholesky, LDL^T,
|
||||||
|
and LU factorization via cuDSS, alongside dense solvers (cuSOLVER), matrix
|
||||||
|
products (cuBLAS), FFT (cuFFT), and sparse matrix-vector products (cuSPARSE).
|
||||||
|
|
||||||
|
Existing Eigen users should be able to move performance-critical dense or
|
||||||
|
sparse linear algebra to the GPU with minimal code changes and without
|
||||||
|
learning CUDA library APIs directly.
|
||||||
|
|
||||||
|
## Design philosophy
|
||||||
|
|
||||||
|
**CPU and GPU coexist.** There is no global compile-time switch that replaces
|
||||||
|
CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers
|
||||||
|
explicitly -- `GpuLLT<double>` vs `LLT<MatrixXd>`, `GpuSparseLLT<double>` vs
|
||||||
|
`SimplicialLLT<SparseMatrix<double>>` -- and both coexist in the same binary.
|
||||||
|
This also lets users keep the factored matrix on device across multiple solves,
|
||||||
|
something impossible with compile-time replacement.
|
||||||
|
|
||||||
|
**Familiar syntax.** GPU operations use the same expression patterns as CPU
|
||||||
|
Eigen. Here is a side-by-side comparison:
|
||||||
|
|
||||||
|
```cpp
// ---- CPU (Eigen) ----                  // ---- GPU (Eigen/GPU) ----
#include <Eigen/Dense>                    #define EIGEN_USE_GPU
                                          #include <Eigen/GPU>

// Dense
MatrixXd A = ...;                         auto d_A = DeviceMatrix<double>::fromHost(A);
MatrixXd B = ...;                         auto d_B = DeviceMatrix<double>::fromHost(B);

MatrixXd C = A * B;                       DeviceMatrix<double> d_C = d_A * d_B;
MatrixXd X = A.llt().solve(B);            DeviceMatrix<double> d_X = d_A.llt().solve(d_B);

                                          MatrixXd X = d_X.toHost();

// Sparse (using SpMat = SparseMatrix<double>)
SimplicialLLT<SpMat> llt(A);              GpuSparseLLT<double> llt(A);
VectorXd x = llt.solve(b);                VectorXd x = llt.solve(b);
```
|
||||||
|
|
||||||
|
The GPU version reads like CPU Eigen with explicit upload/download for dense
|
||||||
|
operations, and an almost identical API for sparse solvers. Unsupported
|
||||||
|
expressions are compile errors.
|
||||||
|
|
||||||
|
**Standalone module.** `Eigen/GPU` does not modify or depend on Eigen's Core
|
||||||
|
expression template system (`MatrixBase`, `CwiseBinaryOp`, etc.).
|
||||||
|
`DeviceMatrix` is not an Eigen expression type and does not inherit from
|
||||||
|
`MatrixBase`. The expression layer is a thin compile-time dispatch where every
|
||||||
|
supported expression maps to a single NVIDIA library call. There is no
|
||||||
|
coefficient-level evaluation, lazy fusion, or packet operations.
|
||||||
|
|
||||||
|
**Interoperability where useful.** `DeviceMatrix` provides the same operator
|
||||||
|
signatures as `Matrix` for common vector operations: `+=`, `-=`, `*=`,
|
||||||
|
`dot()`, `squaredNorm()`, `norm()`, `setZero()`, and `noalias()`. This makes
|
||||||
|
`DeviceMatrix` usable as a drop-in `VectorType` in Eigen algorithm templates
|
||||||
|
that rely on these operations. For example, Eigen's `conjugate_gradient()`
|
||||||
|
template works with `DeviceMatrix` with a single typedef change -- no
|
||||||
|
modifications to the algorithm or the expression template system. Conjugate
|
||||||
|
gradient is just the motivating example; we are open to expanding operator
|
||||||
|
coverage as needed to support other high-level Eigen algorithms on the GPU.
|
||||||
|
|
||||||
|
**Explicit over implicit.** Host-device transfers, stream management, and
|
||||||
|
library handle lifetimes are visible in the API. There are no hidden
|
||||||
|
allocations or synchronizations except where documented (e.g., `toHost()` must
|
||||||
|
synchronize to deliver data to the host).
|
||||||
|
|
||||||
|
## Key concepts
|
||||||
|
|
||||||
|
### `DeviceMatrix<Scalar>`
|
||||||
|
|
||||||
|
A typed RAII wrapper for a dense column-major matrix in GPU device memory.
|
||||||
|
This is the GPU counterpart of Eigen's `MatrixX<Scalar>`. A vector is simply
|
||||||
|
a `DeviceMatrix` with one column.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
// Upload from host
|
||||||
|
auto d_A = DeviceMatrix<double>::fromHost(A);
|
||||||
|
|
||||||
|
// Allocate uninitialized
|
||||||
|
DeviceMatrix<double> d_C(m, n);
|
||||||
|
|
||||||
|
// Download to host
|
||||||
|
MatrixXd C = d_C.toHost();
|
||||||
|
|
||||||
|
// Async download (returns a future)
|
||||||
|
auto transfer = d_C.toHostAsync();
|
||||||
|
// ... do other work ...
|
||||||
|
MatrixXd C = transfer.get();
|
||||||
|
```
|
||||||
|
|
||||||
|
`DeviceMatrix` supports expression methods that mirror Eigen's API:
|
||||||
|
`adjoint()`, `transpose()`, `triangularView<UpLo>()`,
|
||||||
|
`selfadjointView<UpLo>()`, `llt()`, `lu()`. These return lightweight
|
||||||
|
expression objects that are evaluated when assigned.
|
||||||
|
|
||||||
|
For BLAS Level-1 operations, `DeviceMatrix` also provides `dot()`, `norm()`,
|
||||||
|
`squaredNorm()`, `setZero()`, `noalias()`, and arithmetic operators
|
||||||
|
(`+=`, `-=`, `*=`) that dispatch to cuBLAS `axpy`, `scal`, `nrm2`, `dot`, and
|
||||||
|
`geam`. These are the operations needed by iterative solvers.
|
||||||
|
|
||||||
|
### `DeviceScalar<Scalar>`
|
||||||
|
|
||||||
|
A device-resident scalar value. Reductions like `dot()`, `norm()`, and
|
||||||
|
`squaredNorm()` return `DeviceScalar` instead of a host scalar, deferring
|
||||||
|
the host synchronization until the value is actually needed:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
auto dot_val = d_x.dot(d_y); // DeviceScalar -- no sync
|
||||||
|
auto norm_sq = d_r.squaredNorm(); // DeviceScalar -- no sync
|
||||||
|
Scalar alpha = dot_val / norm_sq; // sync here (implicit conversion)
|
||||||
|
d_x += alpha * d_p; // host scalar * DeviceMatrix (axpy)
|
||||||
|
```
|
||||||
|
|
||||||
|
Division between `DeviceScalar` values (real types only) is performed on
|
||||||
|
device via NPP, avoiding extra synchronizations.
|
||||||
|
|
||||||
|
### `GpuContext`
|
||||||
|
|
||||||
|
Every GPU operation needs a CUDA stream and library handles (cuBLAS,
|
||||||
|
cuSOLVER). `GpuContext` bundles these together.
|
||||||
|
|
||||||
|
For simple usage, you don't need to create one -- a per-thread default context
|
||||||
|
is created lazily on first use:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
// These use the thread-local default context automatically
|
||||||
|
d_C = d_A * d_B;
|
||||||
|
d_X = d_A.llt().solve(d_B);
|
||||||
|
```
|
||||||
|
|
||||||
|
For concurrent multi-stream execution, create explicit contexts:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuContext ctx1, ctx2;
|
||||||
|
d_C1.device(ctx1) = d_A1 * d_B1; // runs on stream 1
|
||||||
|
d_C2.device(ctx2) = d_A2 * d_B2; // runs on stream 2 (concurrently)
|
||||||
|
```
|
||||||
|
|
||||||
|
To integrate with existing CUDA code, borrow an existing stream:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuContext ctx(my_existing_stream); // wraps stream, does not take ownership
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Matrix operations (cuBLAS)
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
auto d_A = DeviceMatrix<double>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<double>::fromHost(B);
|
||||||
|
|
||||||
|
// GEMM: C = A * B, C = A^H * B, C = A * B^T, ...
|
||||||
|
DeviceMatrix<double> d_C = d_A * d_B;
|
||||||
|
d_C = d_A.adjoint() * d_B;
|
||||||
|
d_C = d_A * d_B.transpose();
|
||||||
|
|
||||||
|
// Scaled and accumulated
|
||||||
|
d_C += 2.0 * d_A * d_B; // alpha=2, beta=1
|
||||||
|
d_C.device(ctx) -= d_A * d_B; // alpha=-1, beta=1 (GEMM requires explicit context for -=)
|
||||||
|
|
||||||
|
// Triangular solve (TRSM)
|
||||||
|
d_X = d_A.triangularView<Lower>().solve(d_B);
|
||||||
|
|
||||||
|
// Symmetric/Hermitian multiply (SYMM/HEMM)
|
||||||
|
d_C = d_A.selfadjointView<Lower>() * d_B;
|
||||||
|
|
||||||
|
// Rank-k update (SYRK/HERK)
|
||||||
|
d_C.selfadjointView<Lower>().rankUpdate(d_A); // C += A * A^H
|
||||||
|
```
|
||||||
|
|
||||||
|
### BLAS Level-1 operations
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
// Dot product and norms (return DeviceScalar -- no sync until read)
|
||||||
|
auto dot_val = d_x.dot(d_y); // cublasDdot / cublasCdotc
|
||||||
|
auto norm_val = d_r.norm(); // cublasDnrm2
|
||||||
|
double n = norm_val; // implicit conversion triggers sync
|
||||||
|
|
||||||
|
// Vector arithmetic (cuBLAS axpy / scal)
|
||||||
|
d_x += alpha * d_p; // axpy: x = x + alpha * p
|
||||||
|
d_x -= alpha * d_p; // axpy: x = x - alpha * p
|
||||||
|
d_x *= alpha; // scal: x = alpha * x
|
||||||
|
d_r.setZero(); // cudaMemsetAsync
|
||||||
|
|
||||||
|
// DeviceScalar arithmetic (stays on device, real types only)
|
||||||
|
auto alpha = absNew / dot_val; // device-side division via NPP
|
||||||
|
d_x += alpha * d_p; // DeviceScalar * DeviceMatrix (axpy with device pointer)
|
||||||
|
|
||||||
|
// Matrix add/subtract (cuBLAS geam)
|
||||||
|
DeviceMatrix<double> d_C = d_A + d_B; // C = A + B
|
||||||
|
d_C = d_A + 2.0 * d_B; // C = A + 2*B
|
||||||
|
d_C = d_A - d_B; // C = A - B
|
||||||
|
```
|
||||||
|
|
||||||
|
### Dense solvers (cuSOLVER)
|
||||||
|
|
||||||
|
**One-shot expression syntax** -- Convenient, re-factorizes each time:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
// Cholesky solve (potrf + potrs)
|
||||||
|
d_X = d_A.llt().solve(d_B);
|
||||||
|
|
||||||
|
// LU solve (getrf + getrs)
|
||||||
|
d_Y = d_A.lu().solve(d_B);
|
||||||
|
```
|
||||||
|
|
||||||
|
**Cached factorization** -- Factor once, solve many times:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuLLT<double> llt;
|
||||||
|
llt.compute(d_A); // factorize (async)
|
||||||
|
if (llt.info() != Success) { ... } // lazy sync on first info() call
|
||||||
|
auto d_X1 = llt.solve(d_B1); // reuses factor (async)
|
||||||
|
auto d_X2 = llt.solve(d_B2); // reuses factor (async)
|
||||||
|
MatrixXd X2 = d_X2.toHost();
|
||||||
|
|
||||||
|
// LU with transpose solve
|
||||||
|
GpuLU<double> lu;
|
||||||
|
lu.compute(d_A);
|
||||||
|
auto d_Y = lu.solve(d_B, GpuLU<double>::Transpose); // A^T Y = B
|
||||||
|
|
||||||
|
// QR solve (overdetermined least squares)
|
||||||
|
GpuQR<double> qr;
|
||||||
|
qr.compute(d_A); // factorize on device (async)
|
||||||
|
auto d_X = qr.solve(d_B); // Q^H * B via ormqr, then trsm on R
|
||||||
|
MatrixXd X = d_X.toHost();
|
||||||
|
|
||||||
|
// SVD (results downloaded on access)
|
||||||
|
GpuSVD<double> svd;
|
||||||
|
svd.compute(d_A, ComputeThinU | ComputeThinV);
|
||||||
|
VectorXd S = svd.singularValues(); // downloads to host
|
||||||
|
MatrixXd U = svd.matrixU(); // downloads to host
|
||||||
|
MatrixXd V = svd.matrixV(); // V (matches JacobiSVD)
|
||||||
|
MatrixXd VT = svd.matrixVT(); // V^T (matches cuSOLVER)
|
||||||
|
|
||||||
|
// Self-adjoint eigenvalue decomposition (results downloaded on access)
|
||||||
|
GpuSelfAdjointEigenSolver<double> es;
|
||||||
|
es.compute(d_A);
|
||||||
|
VectorXd eigenvals = es.eigenvalues(); // downloads to host
|
||||||
|
MatrixXd eigenvecs = es.eigenvectors(); // downloads to host
|
||||||
|
```
|
||||||
|
|
||||||
|
The cached API keeps the factored matrix on device, avoiding redundant
|
||||||
|
host-device transfers and re-factorizations. All solvers also accept host
|
||||||
|
matrices directly as a convenience (e.g., `GpuLLT<double> llt(A)` or
|
||||||
|
`qr.solve(B)`); in that case upload/download is handled internally.
|
||||||
|
|
||||||
|
### Sparse direct solvers (cuDSS)
|
||||||
|
|
||||||
|
Requires cuDSS (separate install, CUDA 12.0+). Define `EIGEN_CUDSS` before
|
||||||
|
including `Eigen/GPU` and link with `-lcudss`.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
SparseMatrix<double> A = ...; // symmetric positive definite
|
||||||
|
VectorXd b = ...;
|
||||||
|
|
||||||
|
// Sparse Cholesky -- one-liner
|
||||||
|
GpuSparseLLT<double> llt(A);
|
||||||
|
VectorXd x = llt.solve(b);
|
||||||
|
|
||||||
|
// Three-phase workflow for repeated solves with the same sparsity pattern
|
||||||
|
GpuSparseLLT<double> llt;
|
||||||
|
llt.analyzePattern(A); // symbolic analysis (once)
|
||||||
|
llt.factorize(A); // numeric factorization
|
||||||
|
VectorXd x = llt.solve(b);
|
||||||
|
llt.factorize(A_new_values); // refactorize (reuses symbolic analysis)
|
||||||
|
VectorXd x2 = llt.solve(b);
|
||||||
|
|
||||||
|
// Sparse LDL^T (symmetric indefinite)
|
||||||
|
GpuSparseLDLT<double> ldlt(A);
|
||||||
|
VectorXd x = ldlt.solve(b);
|
||||||
|
|
||||||
|
// Sparse LU (general non-symmetric)
|
||||||
|
GpuSparseLU<double> lu(A);
|
||||||
|
VectorXd x = lu.solve(b);
|
||||||
|
```
|
||||||
|
|
||||||
|
### FFT (cuFFT)
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuFFT<float> fft;
|
||||||
|
|
||||||
|
// 1D complex-to-complex
|
||||||
|
VectorXcf X = fft.fwd(x); // forward
|
||||||
|
VectorXcf y = fft.inv(X); // inverse (scaled by 1/n)
|
||||||
|
|
||||||
|
// 1D real-to-complex / complex-to-real
|
||||||
|
VectorXcf R = fft.fwd(r); // returns n/2+1 complex (half-spectrum)
|
||||||
|
VectorXf s = fft.invReal(R, n); // C2R inverse, caller specifies n
|
||||||
|
|
||||||
|
// 2D complex-to-complex
|
||||||
|
MatrixXcf B = fft.fwd2d(A); // 2D forward
|
||||||
|
MatrixXcf C = fft.inv2d(B); // 2D inverse (scaled by 1/(rows*cols))
|
||||||
|
|
||||||
|
// Plans are cached and reused across calls with the same size/type.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Sparse matrix-vector multiply (cuSPARSE)
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
SparseMatrix<double> A = ...;
|
||||||
|
VectorXd x = ...;
|
||||||
|
|
||||||
|
// Host vectors (upload/download handled internally)
|
||||||
|
GpuSparseContext<double> spmv;
|
||||||
|
VectorXd y = spmv.multiply(A, x); // y = A * x
|
||||||
|
VectorXd z = spmv.multiplyT(A, x); // z = A^T * x
|
||||||
|
spmv.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y
|
||||||
|
MatrixXd Y = spmv.multiplyMat(A, X); // Y = A * X (SpMM)
|
||||||
|
|
||||||
|
// Device-resident SpMV (sparse matrix cached on device)
|
||||||
|
GpuSparseContext<double> spmv(ctx); // share GpuContext for same-stream
|
||||||
|
auto d_A = spmv.deviceView(A); // upload sparse matrix once
|
||||||
|
d_y = d_A * d_x; // operator syntax, stays on device
|
||||||
|
```
|
||||||
|
|
||||||
|
### Eigen algorithm interop (example: Conjugate gradient)
|
||||||
|
|
||||||
|
The BLAS-1 operators and `DeviceSparseView` make `DeviceMatrix` usable as a
|
||||||
|
vector type in GPU implementations of algorithms like conjugate gradient.
|
||||||
|
Conjugate gradient is the motivating example -- a GPU CG implementation
|
||||||
|
uses the same operations as the CPU version:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuContext ctx;
|
||||||
|
GpuSparseContext<double> spmv(ctx);
|
||||||
|
auto d_A = spmv.deviceView(A); // sparse matrix on device
|
||||||
|
auto d_b = DeviceMatrix<double>::fromHost(b);
|
||||||
|
auto d_x = DeviceMatrix<double>::fromHost(x0);
|
||||||
|
|
||||||
|
// CG iteration using DeviceMatrix operators
|
||||||
|
DeviceMatrix<double> d_r = d_b; // r = b (deep copy via geam)
|
||||||
|
DeviceMatrix<double> d_p(n), d_tmp(n);
|
||||||
|
d_tmp = d_A * d_x; // SpMV (device-resident)
|
||||||
|
d_r -= d_tmp; // axpy
|
||||||
|
d_p = d_r.clone();
|
||||||
|
RealScalar absNew = d_r.squaredNorm(); // DeviceScalar -> implicit sync
|
||||||
|
|
||||||
|
for (int i = 0; i < maxIters && absNew > tol * tol; ++i) {
|
||||||
|
d_tmp = d_A * d_p; // SpMV
|
||||||
|
auto alpha = absNew / d_p.dot(d_tmp); // host / DeviceScalar -> DeviceScalar
|
||||||
|
d_x += alpha * d_p; // axpy with DeviceScalar
|
||||||
|
d_r -= alpha * d_tmp; // axpy with DeviceScalar
|
||||||
|
RealScalar absOld = absNew;
|
||||||
|
absNew = d_r.squaredNorm(); // DeviceScalar -> implicit sync
|
||||||
|
d_p *= Scalar(absNew / absOld); // scal (host scalars)
|
||||||
|
d_p += d_r; // axpy
|
||||||
|
}
|
||||||
|
MatrixXd x = d_x.toHost();
|
||||||
|
```
|
||||||
|
|
||||||
|
### Precision control
|
||||||
|
|
||||||
|
GEMM dispatch uses `cublasLtMatmul` with heuristic algorithm selection,
|
||||||
|
enabling cuBLAS to choose tensor core algorithms when beneficial. For double
|
||||||
|
precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64 results
|
||||||
|
computed faster via tensor cores.
|
||||||
|
|
||||||
|
| Macro | Effect |
|
||||||
|
|---|---|
|
||||||
|
| *(default)* | Tensor core algorithms enabled. Float uses full FP32. Double may use Ozaki on sm_80+. |
|
||||||
|
| `EIGEN_CUDA_TF32` | Opt-in: Float uses TF32 (~2x faster, 10-bit mantissa). Double unaffected. |
|
||||||
|
| `EIGEN_NO_CUDA_TENSOR_OPS` | Opt-out: Pedantic compute types, no tensor cores. For bit-exact reproducibility. |
|
||||||
|
|
||||||
|
### Stream control and async execution
|
||||||
|
|
||||||
|
Operations are asynchronous by default. The compute-solve chain runs without
|
||||||
|
host synchronization until you need a result on the host:
|
||||||
|
|
||||||
|
```
|
||||||
|
fromHost(A) --sync--> compute() --async--> solve() --async--> toHost()
|
||||||
|
H2D potrf potrs D2H
|
||||||
|
sync
|
||||||
|
```
|
||||||
|
|
||||||
|
Mandatory sync points:
|
||||||
|
- `fromHost()` -- Synchronizes to complete the upload before returning
|
||||||
|
- `toHost()` / `HostTransfer::get()` -- Must deliver data to host
|
||||||
|
- `info()` -- Must read the factorization status
|
||||||
|
- `DeviceScalar` implicit conversion -- Downloads scalar from device
|
||||||
|
|
||||||
|
**Cross-stream safety** is automatic. `DeviceMatrix` tracks write completion
|
||||||
|
via CUDA events. When a matrix written on stream A is read on stream B, the
|
||||||
|
module automatically inserts `cudaStreamWaitEvent`. Same-stream operations
|
||||||
|
skip the wait (CUDA guarantees in-order execution within a stream).
|
||||||
|
|
||||||
|
## Reference
|
||||||
|
|
||||||
|
### Supported scalar types
|
||||||
|
|
||||||
|
`float`, `double`, `std::complex<float>`, `std::complex<double>` (unless
|
||||||
|
noted otherwise).
|
||||||
|
|
||||||
|
### Expression -> library call mapping
|
||||||
|
|
||||||
|
| DeviceMatrix expression | Library call | Parameters |
|
||||||
|
|---|---|---|
|
||||||
|
| `C = A * B` | `cublasLtMatmul` | transA=N, transB=N, alpha=1, beta=0 |
|
||||||
|
| `C = A.adjoint() * B` | `cublasLtMatmul` | transA=C, transB=N |
|
||||||
|
| `C = A.transpose() * B` | `cublasLtMatmul` | transA=T, transB=N |
|
||||||
|
| `C = A * B.adjoint()` | `cublasLtMatmul` | transA=N, transB=C |
|
||||||
|
| `C = A * B.transpose()` | `cublasLtMatmul` | transA=N, transB=T |
|
||||||
|
| `C = alpha * A * B` | `cublasLtMatmul` | alpha from LHS |
|
||||||
|
| `C = A * (alpha * B)` | `cublasLtMatmul` | alpha from RHS |
|
||||||
|
| `C += A * B` | `cublasLtMatmul` | alpha=1, beta=1 |
|
||||||
|
| `C.device(ctx) -= A * B` | `cublasLtMatmul` | alpha=-1, beta=1 |
|
||||||
|
| `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs |
|
||||||
|
| `X = A.llt<Upper>().solve(B)` | same | uplo=Upper |
|
||||||
|
| `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs |
|
||||||
|
| `X = A.triangularView<L>().solve(B)` | `cublasXtrsm` | side=L, uplo, diag=NonUnit |
|
||||||
|
| `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
|
||||||
|
| `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
|
||||||
|
| `C = A + B` | `cublasXgeam` | alpha=1, beta=1 |
|
||||||
|
| `C = A + alpha * B` | `cublasXgeam` | alpha=1, beta from scaled |
|
||||||
|
| `C = A - B` | `cublasXgeam` | alpha=1, beta=-1 |
|
||||||
|
| `C = A - alpha * B` | `cublasXgeam` | alpha=1, beta=-scaled |
|
||||||
|
| `x += alpha * y` | `cublasXaxpy` | alpha (host scalar) |
|
||||||
|
| `x += dAlpha * y` | `cublasXaxpy` | alpha (DeviceScalar, device pointer mode) |
|
||||||
|
| `x -= alpha * y` | `cublasXaxpy` | alpha negated |
|
||||||
|
| `x *= alpha` | `cublasXscal` | alpha (host or DeviceScalar) |
|
||||||
|
| `x.dot(y)` | `cublasXdot` / `cublasXdotc` | returns `DeviceScalar` |
|
||||||
|
| `x.norm()` | `cublasXnrm2` | returns `DeviceScalar<RealScalar>` |
|
||||||
|
| `x.squaredNorm()` | `cublasXdot(x, x)` | returns `DeviceScalar<RealScalar>` |
|
||||||
|
| `d_y = view * d_x` | `cusparseSpMV` | device-resident SpMV |
|
||||||
|
|
||||||
|
### `DeviceMatrix<Scalar>`
|
||||||
|
|
||||||
|
Typed RAII wrapper for a dense column-major matrix in GPU device memory.
|
||||||
|
Always dense (leading dimension = rows). A vector is a `DeviceMatrix` with
|
||||||
|
one column.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
// Construction
|
||||||
|
DeviceMatrix<Scalar>() // Empty (0x0)
|
||||||
|
DeviceMatrix<Scalar>(Index n) // Allocate column vector (n x 1)
|
||||||
|
DeviceMatrix<Scalar>(rows, cols) // Allocate uninitialized
|
||||||
|
|
||||||
|
// Upload / download
|
||||||
|
static DeviceMatrix fromHost(matrix, stream=nullptr) // -> DeviceMatrix (syncs)
|
||||||
|
static DeviceMatrix fromHostAsync(ptr, rows, cols, stream) // -> DeviceMatrix (no sync, caller manages ptr lifetime)
|
||||||
|
PlainMatrix toHost(stream=nullptr) // -> host Matrix (syncs)
|
||||||
|
HostTransfer toHostAsync(stream=nullptr) // -> HostTransfer future (no sync)
|
||||||
|
DeviceMatrix clone(stream=nullptr) // -> DeviceMatrix (D2D copy, async)
|
||||||
|
|
||||||
|
// Dimensions and access
|
||||||
|
Index rows()
|
||||||
|
Index cols()
|
||||||
|
size_t sizeInBytes()
|
||||||
|
bool empty()
|
||||||
|
Scalar* data() // Raw device pointer
|
||||||
|
void resize(Index rows, Index cols) // Discard contents, reallocate
|
||||||
|
|
||||||
|
// Expression builders (return lightweight views, evaluated on assignment)
|
||||||
|
AdjointView adjoint() // GEMM with ConjTrans
|
||||||
|
TransposeView transpose() // GEMM with Trans
|
||||||
|
LltExpr llt() / llt<UpLo>() // -> .solve(d_B) -> DeviceMatrix
|
||||||
|
LuExpr lu() // -> .solve(d_B) -> DeviceMatrix
|
||||||
|
TriangularView triangularView<UpLo>() // -> .solve(d_B) -> DeviceMatrix (TRSM)
|
||||||
|
SelfAdjointView selfadjointView<UpLo>() // -> * d_B (SYMM), .rankUpdate(d_A) (SYRK)
|
||||||
|
DeviceAssignment device(GpuContext& ctx) // Bind assignment to explicit stream
|
||||||
|
DeviceMatrix& noalias() // No-op (all ops are implicitly noalias)
|
||||||
|
|
||||||
|
// BLAS Level-1 (all have overloads with explicit GpuContext& parameter)
|
||||||
|
DeviceScalar<Scalar> dot(const DeviceMatrix& other) // cuBLAS dot/dotc -> DeviceScalar
|
||||||
|
DeviceScalar<RealScalar> norm() // cuBLAS nrm2 -> DeviceScalar
|
||||||
|
DeviceScalar<RealScalar> squaredNorm() // dot(self, self) -> DeviceScalar (no sync)
|
||||||
|
void setZero() // cudaMemsetAsync
|
||||||
|
void addScaled(GpuContext&, Scalar alpha, const DeviceMatrix& x) // this += alpha * x (axpy)
|
||||||
|
void scale(GpuContext&, Scalar alpha) // this *= alpha (scal)
|
||||||
|
void copyFrom(GpuContext&, const DeviceMatrix& other) // this = other (D2D copy)
|
||||||
|
DeviceMatrix& operator+=(Scalar * DeviceMatrix) // cuBLAS axpy
|
||||||
|
DeviceMatrix& operator-=(Scalar * DeviceMatrix) // cuBLAS axpy (negated)
|
||||||
|
DeviceMatrix& operator+=(const DeviceMatrix&) // cuBLAS axpy
|
||||||
|
DeviceMatrix& operator-=(const DeviceMatrix&) // cuBLAS axpy
|
||||||
|
DeviceMatrix& operator+=(const DeviceScaledDevice&) // cuBLAS axpy (DeviceScalar * DeviceMatrix)
|
||||||
|
DeviceMatrix& operator-=(const DeviceScaledDevice&) // cuBLAS axpy (DeviceScalar * DeviceMatrix, negated)
|
||||||
|
DeviceMatrix& operator*=(Scalar) // cuBLAS scal
|
||||||
|
DeviceMatrix& operator*=(const DeviceScalar<Scalar>&) // cuBLAS scal (device pointer)
|
||||||
|
DeviceMatrix cwiseProduct(GpuContext&, const DeviceMatrix&) // NPP nppsMul (float/double only)
|
||||||
|
void cwiseProduct(GpuContext&, const DeviceMatrix&, const DeviceMatrix&) // in-place: this = a .* b
|
||||||
|
|
||||||
|
// geam expressions (evaluated on assignment)
|
||||||
|
DeviceMatrix& operator=(const DeviceAddExpr&) // C = A + B, C = A + alpha*B, C = A - B, etc.
|
||||||
|
```
|
||||||
|
|
||||||
|
### `DeviceScalar<Scalar>`
|
||||||
|
|
||||||
|
Device-resident scalar. Returned by `dot()`, `norm()`, and `squaredNorm()`.
|
||||||
|
Implicit conversion to `Scalar` triggers `cudaStreamSynchronize` + download.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
DeviceScalar(cudaStream_t stream = nullptr) // Allocate uninitialized
|
||||||
|
DeviceScalar(Scalar host_val, cudaStream_t stream) // Upload host value
|
||||||
|
|
||||||
|
Scalar get() // Download (syncs stream)
|
||||||
|
operator Scalar() // Implicit conversion (syncs)
|
||||||
|
Scalar* devicePtr() // Raw device pointer
|
||||||
|
cudaStream_t stream()
|
||||||
|
|
||||||
|
// Device-side arithmetic (no host sync, real types only)
|
||||||
|
DeviceScalar operator/(DeviceScalar, DeviceScalar) // NPP nppsDiv
|
||||||
|
DeviceScalar operator/(Scalar, DeviceScalar) // upload + div
|
||||||
|
DeviceScalar operator/(DeviceScalar, Scalar) // upload + div
|
||||||
|
DeviceScalar operator-() // NPP nppsMulC(-1)
|
||||||
|
```
|
||||||
|
|
||||||
|
### `GpuContext`
|
||||||
|
|
||||||
|
Unified GPU execution context that bundles a CUDA stream (created and owned by default, or borrowed from the caller) with the library handles it owns.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuContext() // Creates dedicated stream + handles
|
||||||
|
GpuContext(cudaStream_t stream) // Borrow existing stream (not owned)
|
||||||
|
static GpuContext& threadLocal() // Per-thread default (lazy-created)
|
||||||
|
static void setThreadLocal(GpuContext* ctx) // Override thread-local default (nullptr restores)
|
||||||
|
|
||||||
|
cudaStream_t stream()
|
||||||
|
cublasHandle_t cublasHandle()
|
||||||
|
cusolverDnHandle_t cusolverHandle()
|
||||||
|
cublasLtHandle_t cublasLtHandle() // Lazy-initialized
|
||||||
|
cusparseHandle_t cusparseHandle() // Lazy-initialized
|
||||||
|
```
|
||||||
|
|
||||||
|
Non-copyable, non-movable (owns library handles).
|
||||||
|
|
||||||
|
### `GpuLLT<Scalar, UpLo>` -- Dense Cholesky (cuSOLVER)
|
||||||
|
|
||||||
|
Caches the Cholesky factor on device for repeated solves.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuLLT() // Default construct, then call compute()
|
||||||
|
GpuLLT(const EigenBase<D>& A) // Convenience: upload + factorize
|
||||||
|
|
||||||
|
GpuLLT& compute(const EigenBase<D>& A) // Upload + factorize
|
||||||
|
GpuLLT& compute(const DeviceMatrix& d_A) // D2D copy + factorize
|
||||||
|
GpuLLT& compute(DeviceMatrix&& d_A) // Adopt + factorize (no copy)
|
||||||
|
|
||||||
|
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
|
||||||
|
DeviceMatrix solve(const DeviceMatrix& d_B) // -> DeviceMatrix (async, stays on device)
|
||||||
|
|
||||||
|
ComputationInfo info() // Lazy sync on first call: Success or NumericalIssue
|
||||||
|
Index rows() / cols()
|
||||||
|
cudaStream_t stream()
|
||||||
|
```
|
||||||
|
|
||||||
|
### `GpuLU<Scalar>` -- Dense LU (cuSOLVER)
|
||||||
|
|
||||||
|
Same pattern as `GpuLLT`. Adds `TransposeMode` parameter on `solve()`.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
PlainMatrix solve(const MatrixBase<D>& B, TransposeMode m = NoTranspose) // -> host Matrix
|
||||||
|
DeviceMatrix solve(const DeviceMatrix& d_B, TransposeMode m = NoTranspose) // -> DeviceMatrix
|
||||||
|
```
|
||||||
|
|
||||||
|
`TransposeMode`: `NoTranspose`, `Transpose`, `ConjugateTranspose`.
|
||||||
|
|
||||||
|
### `GpuQR<Scalar>` -- Dense QR (cuSOLVER)
|
||||||
|
|
||||||
|
QR factorization via `cusolverDnXgeqrf`. Solve uses ORMQR (apply Q^H) + TRSM
|
||||||
|
(back-substitute on R) -- Q is never formed explicitly.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuQR() // Default construct
|
||||||
|
GpuQR(const EigenBase<D>& A) // Convenience: upload + factorize
|
||||||
|
|
||||||
|
GpuQR& compute(const EigenBase<D>& A) // Upload + factorize
|
||||||
|
GpuQR& compute(const DeviceMatrix& d_A) // D2D copy + factorize
|
||||||
|
|
||||||
|
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
|
||||||
|
DeviceMatrix solve(const DeviceMatrix& d_B) // -> DeviceMatrix (async)
|
||||||
|
|
||||||
|
ComputationInfo info() // Lazy sync
|
||||||
|
Index rows() / cols()
|
||||||
|
cudaStream_t stream()
|
||||||
|
```
|
||||||
|
|
||||||
|
### `GpuSVD<Scalar>` -- Dense SVD (cuSOLVER)
|
||||||
|
|
||||||
|
SVD via `cusolverDnXgesvd`. Supports `ComputeThinU | ComputeThinV`,
|
||||||
|
`ComputeFullU | ComputeFullV`, or `0` (values only). Wide matrices (m < n)
|
||||||
|
are handled by an internal transpose.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuSVD() // Default construct, then call compute()
|
||||||
|
GpuSVD(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV) // Convenience
|
||||||
|
|
||||||
|
GpuSVD& compute(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV)
|
||||||
|
GpuSVD& compute(const DeviceMatrix& d_A, unsigned options = ComputeThinU | ComputeThinV)
|
||||||
|
|
||||||
|
RealVector singularValues() // -> host vector (syncs, downloads)
|
||||||
|
PlainMatrix matrixU() // -> host Matrix (syncs, downloads)
|
||||||
|
PlainMatrix matrixV() // -> host Matrix (V = VT^H, matches JacobiSVD)
|
||||||
|
PlainMatrix matrixVT() // -> host Matrix (syncs, downloads V^T)
|
||||||
|
|
||||||
|
PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (pseudoinverse)
|
||||||
|
PlainMatrix solve(const MatrixBase<D>& B, Index k) // Truncated (top k triplets)
|
||||||
|
PlainMatrix solve(const MatrixBase<D>& B, RealScalar l) // Tikhonov regularized
|
||||||
|
|
||||||
|
Index rank(RealScalar threshold = -1)
|
||||||
|
ComputationInfo info() // Lazy sync
|
||||||
|
Index rows() / cols()
|
||||||
|
cudaStream_t stream()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:** `singularValues()`, `matrixU()`, `matrixV()`, and `matrixVT()`
|
||||||
|
download to host on each call. Device-side accessors returning `DeviceMatrix`
|
||||||
|
are planned but not yet implemented.
|
||||||
|
|
||||||
|
### `GpuSelfAdjointEigenSolver<Scalar>` -- Eigendecomposition (cuSOLVER)
|
||||||
|
|
||||||
|
Symmetric/Hermitian eigenvalue decomposition via `cusolverDnXsyevd`.
|
||||||
|
`ComputeMode` enum: `EigenvaluesOnly`, `ComputeEigenvectors`.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuSelfAdjointEigenSolver() // Default construct, then call compute()
|
||||||
|
GpuSelfAdjointEigenSolver(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors) // Convenience
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver& compute(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors)
|
||||||
|
GpuSelfAdjointEigenSolver& compute(const DeviceMatrix& d_A, ComputeMode mode = ComputeEigenvectors)
|
||||||
|
|
||||||
|
RealVector eigenvalues() // -> host vector (syncs, downloads, ascending order)
|
||||||
|
PlainMatrix eigenvectors() // -> host Matrix (syncs, downloads, columns)
|
||||||
|
|
||||||
|
ComputationInfo info() // Lazy sync
|
||||||
|
Index rows() / cols()
|
||||||
|
cudaStream_t stream()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:** `eigenvalues()` and `eigenvectors()` download to host on each call.
|
||||||
|
Device-side accessors returning `DeviceMatrix` are planned but not yet
|
||||||
|
implemented.
|
||||||
|
|
||||||
|
### `HostTransfer<Scalar>`
|
||||||
|
|
||||||
|
Future for async device-to-host transfer. Returned by
|
||||||
|
`DeviceMatrix::toHostAsync()`.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
PlainMatrix& get() // Block until complete, return host Matrix ref. Idempotent.
|
||||||
|
bool ready() // Non-blocking poll
|
||||||
|
```
|
||||||
|
|
||||||
|
### `GpuSparseLLT<Scalar, UpLo>` -- Sparse Cholesky (cuDSS)
|
||||||
|
|
||||||
|
Requires cuDSS (CUDA 12.0+, `#define EIGEN_CUDSS`). Three-phase workflow
|
||||||
|
with symbolic reuse. Accepts `SparseMatrix<Scalar, ColMajor, int>` (CSC).
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuSparseLLT() // Default construct
|
||||||
|
GpuSparseLLT(const SparseMatrixBase<D>& A) // Analyze + factorize
|
||||||
|
|
||||||
|
GpuSparseLLT& analyzePattern(const SparseMatrixBase<D>& A) // Symbolic analysis (reusable)
|
||||||
|
GpuSparseLLT& factorize(const SparseMatrixBase<D>& A) // Numeric factorization
|
||||||
|
GpuSparseLLT& compute(const SparseMatrixBase<D>& A) // analyzePattern + factorize
|
||||||
|
void setOrdering(GpuSparseOrdering ord) // AMD (default), METIS, or RCM
|
||||||
|
|
||||||
|
DenseMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs)
|
||||||
|
|
||||||
|
ComputationInfo info() // Lazy sync
|
||||||
|
Index rows() / cols()
|
||||||
|
cudaStream_t stream()
|
||||||
|
```
|
||||||
|
|
||||||
|
### `GpuSparseLDLT<Scalar, UpLo>` -- Sparse LDL^T (cuDSS)
|
||||||
|
|
||||||
|
Symmetric indefinite. Same API as `GpuSparseLLT`.
|
||||||
|
|
||||||
|
### `GpuSparseLU<Scalar>` -- Sparse LU (cuDSS)
|
||||||
|
|
||||||
|
General non-symmetric. Same API as `GpuSparseLLT` (without `UpLo`).
|
||||||
|
|
||||||
|
### `GpuFFT<Scalar>` -- FFT (cuFFT)
|
||||||
|
|
||||||
|
Plans cached by (size, type) and reused. Inverse transforms scaled so
|
||||||
|
`inv(fwd(x)) == x`. Supported scalars: `float`, `double`.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
// 1D transforms (host vectors in and out)
|
||||||
|
ComplexVector fwd(const MatrixBase<D>& x) // C2C forward (complex input)
|
||||||
|
ComplexVector fwd(const MatrixBase<D>& x) // R2C forward (real input, returns n/2+1)
|
||||||
|
ComplexVector inv(const MatrixBase<D>& X) // C2C inverse, scaled by 1/n
|
||||||
|
RealVector invReal(const MatrixBase<D>& X, Index n) // C2R inverse, scaled by 1/n
|
||||||
|
|
||||||
|
// 2D transforms (host matrices in and out)
|
||||||
|
ComplexMatrix fwd2d(const MatrixBase<D>& A) // 2D C2C forward
|
||||||
|
ComplexMatrix inv2d(const MatrixBase<D>& A) // 2D C2C inverse, scaled by 1/(rows*cols)
|
||||||
|
|
||||||
|
cudaStream_t stream()
|
||||||
|
```
|
||||||
|
|
||||||
|
All FFT methods accept host data and return host data. Upload/download is
|
||||||
|
handled internally. The C2C and R2C overloads of `fwd()` are distinguished by
|
||||||
|
the input scalar type (complex vs real).
|
||||||
|
|
||||||
|
### `GpuSparseContext<Scalar>` -- SpMV/SpMM (cuSPARSE)
|
||||||
|
|
||||||
|
Accepts `SparseMatrix<Scalar, ColMajor>`.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
GpuSparseContext() // Creates own stream + cuSPARSE handle
|
||||||
|
GpuSparseContext(GpuContext& ctx) // Borrow GpuContext for same-stream execution
|
||||||
|
|
||||||
|
// Host data in/out
|
||||||
|
DenseVector multiply(A, x) // y = A * x
|
||||||
|
void multiply(A, x, y, alpha=1, beta=0, // y = alpha*op(A)*x + beta*y
|
||||||
|
op=CUSPARSE_OPERATION_NON_TRANSPOSE)
|
||||||
|
DenseVector multiplyT(A, x) // y = A^T * x
|
||||||
|
DenseMatrix multiplyMat(A, X) // Y = A * X (SpMM)
|
||||||
|
|
||||||
|
// DeviceMatrix in/out (sparse matrix re-uploaded each call)
|
||||||
|
void multiply(A, d_x, d_y) // SpMV with device vectors
|
||||||
|
void multiply(A, d_x, d_y, alpha, beta, op)
|
||||||
|
|
||||||
|
// Device-resident sparse matrix (upload once, reuse)
|
||||||
|
DeviceSparseView deviceView(A) // Upload sparse matrix, return view
|
||||||
|
|
||||||
|
cudaStream_t stream()
|
||||||
|
```
|
||||||
|
|
||||||
|
### `DeviceSparseView<Scalar>` -- Device-resident sparse matrix
|
||||||
|
|
||||||
|
Returned by `GpuSparseContext::deviceView()`. Holds a sparse matrix on device
|
||||||
|
for repeated SpMV without re-uploading.
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
SpMVExpr operator*(const DeviceMatrix& d_x) // d_y = view * d_x (evaluated on assignment)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Aliasing
|
||||||
|
|
||||||
|
Unlike Eigen's `Matrix`, where omitting `.noalias()` triggers a copy to a
|
||||||
|
temporary, DeviceMatrix dispatches directly to NVIDIA library calls which have
|
||||||
|
no built-in aliasing protection. All operations are implicitly noalias.
|
||||||
|
The caller must ensure operands don't alias the destination for GEMM and TRSM
|
||||||
|
(debug asserts catch violations). `geam` expressions (`d_C = d_A + alpha * d_B`)
|
||||||
|
are safe with aliasing. The `.noalias()` method exists as a no-op for Eigen
|
||||||
|
template compatibility.
|
||||||
|
|
||||||
|
## File layout
|
||||||
|
|
||||||
|
| File | Depends on | Contents |
|
||||||
|
|------|-----------|----------|
|
||||||
|
| `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `cuda_data_type<>` |
|
||||||
|
| `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` |
|
||||||
|
| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM and geam expression wrappers |
|
||||||
|
| `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers |
|
||||||
|
| `DeviceSolverExpr.h` | `DeviceMatrix.h` | Solver expression wrappers (LLT, LU) |
|
||||||
|
| `DeviceScalar.h` | `GpuSupport.h`, `DeviceScalarOps.h` | `DeviceScalar<>` (device-resident scalar) |
|
||||||
|
| `DeviceScalarOps.h` | `<npps_*.h>` | Scalar div/neg/cwiseProduct via NPP |
|
||||||
|
| `DeviceDispatch.h` | all above | All dispatch functions + `DeviceAssignment` |
|
||||||
|
| `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `GpuContext` |
|
||||||
|
| `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>`, `<cublasLt.h>` | cuBLAS/cuBLASLt error macro, type maps |
|
||||||
|
| `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
|
||||||
|
| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
|
||||||
|
| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
|
||||||
|
| `GpuQR.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense QR decomposition |
|
||||||
|
| `GpuSVD.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Dense SVD decomposition |
|
||||||
|
| `GpuEigenSolver.h` | `CuSolverSupport.h` | Self-adjoint eigenvalue decomposition |
|
||||||
|
| `CuFftSupport.h` | `GpuSupport.h`, `<cufft.h>` | cuFFT error macro, type-dispatch wrappers |
|
||||||
|
| `GpuFFT.h` | `CuFftSupport.h`, `CuBlasSupport.h` | 1D/2D FFT with plan caching |
|
||||||
|
| `CuSparseSupport.h` | `GpuSupport.h`, `<cusparse.h>` | cuSPARSE error macro |
|
||||||
|
| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE, `DeviceSparseView` |
|
||||||
|
| `CuDssSupport.h` | `GpuSupport.h`, `<cudss.h>` | cuDSS error macro, type traits (optional) |
|
||||||
|
| `GpuSparseSolverBase.h` | `CuDssSupport.h` | CRTP base for sparse solvers (optional) |
|
||||||
|
| `GpuSparseLLT.h` | `GpuSparseSolverBase.h` | Sparse Cholesky via cuDSS (optional) |
|
||||||
|
| `GpuSparseLDLT.h` | `GpuSparseSolverBase.h` | Sparse LDL^T via cuDSS (optional) |
|
||||||
|
| `GpuSparseLU.h` | `GpuSparseSolverBase.h` | Sparse LU via cuDSS (optional) |
|
||||||
|
|
||||||
|
## Building and testing
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cmake -G Ninja -B build -S . \
|
||||||
|
-DEIGEN_TEST_CUDA=ON \
|
||||||
|
-DEIGEN_CUDA_COMPUTE_ARCH="70" \
|
||||||
|
-DEIGEN_TEST_CUBLAS=ON \
|
||||||
|
-DEIGEN_TEST_CUSOLVER=ON
|
||||||
|
|
||||||
|
cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu \
|
||||||
|
gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen \
|
||||||
|
gpu_device_matrix gpu_cufft gpu_cusparse_spmv gpu_cg
|
||||||
|
ctest --test-dir build -R "gpu_" --output-on-failure
|
||||||
|
|
||||||
|
# Sparse solvers (cuDSS -- separate install required)
|
||||||
|
cmake -G Ninja -B build -S . \
|
||||||
|
-DEIGEN_TEST_CUDA=ON \
|
||||||
|
-DEIGEN_CUDA_COMPUTE_ARCH="70" \
|
||||||
|
-DEIGEN_TEST_CUDSS=ON
|
||||||
|
|
||||||
|
cmake --build build --target gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu
|
||||||
|
ctest --test-dir build -R gpu_cudss --output-on-failure
|
||||||
|
```
|
||||||
|
|
||||||
|
## Future work
|
||||||
|
|
||||||
|
- **Device-side accessors for decomposition results.** `GpuSVD`,
|
||||||
|
`GpuSelfAdjointEigenSolver`, and `GpuQR` currently download decomposition
|
||||||
|
results to host on access (e.g., `svd.matrixU()` returns a host `MatrixXd`).
|
||||||
|
Device-side accessors returning `DeviceMatrix` views of the internal buffers
|
||||||
|
would allow chaining GPU operations (e.g., `svd.deviceU() * d_A`) without
|
||||||
|
round-tripping through host memory.
|
||||||
|
- **Batched API (`DeviceBatchMatrix`).** A strided batch of N identical-size
|
||||||
|
matrices dispatching to cuBLAS/cuSOLVER batched APIs (`cublasDgemmBatched`,
|
||||||
|
`cusolverDnXpotrfBatched`, etc.). This enables robotics and model-predictive
|
||||||
|
control workloads where many small independent systems are solved in
|
||||||
|
parallel.
|
||||||
|
- **cuTENSOR for Tensor module.** Replace the hand-written GPU tensor
|
||||||
|
contraction and reduction kernels (~2300 lines in
|
||||||
|
`TensorContractionGpu.h` / `TensorReductionGpu.h`) with cuTENSOR dispatch,
|
||||||
|
following the same library-dispatch pattern used by `Eigen/GPU`.
|
||||||
|
- **Unified/zero-copy memory for Jetson.** Use `cudaMallocManaged` or
|
||||||
|
`cudaHostAllocMapped` to eliminate `fromHost()` / `toHost()` copies on
|
||||||
|
integrated GPUs (Jetson) where CPU and GPU share DRAM.
|
||||||
|
- **Device-side Eigen interop.** Bridge between host-side `DeviceMatrix`
|
||||||
|
dispatch and device-side Eigen expression templates (Core + Tensor) running
|
||||||
|
inside CUDA kernels. Raw-pointer + `Map` / `TensorMap` as the zero-copy
|
||||||
|
interop surface.
|
||||||
@@ -31,7 +31,10 @@ EIGEN_DONT_INLINE void conjugate_gradient(const MatrixType& mat, const Rhs& rhs,
|
|||||||
Index& iters, typename Dest::RealScalar& tol_error) {
|
Index& iters, typename Dest::RealScalar& tol_error) {
|
||||||
typedef typename Dest::RealScalar RealScalar;
|
typedef typename Dest::RealScalar RealScalar;
|
||||||
typedef typename Dest::Scalar Scalar;
|
typedef typename Dest::Scalar Scalar;
|
||||||
typedef Matrix<Scalar, Dynamic, 1> VectorType;
|
// Use Dest's plain (owning) type as VectorType. For CPU Matrix/Map this
|
||||||
|
// resolves to Matrix<Scalar,Dynamic,1>. For GPU DeviceMatrix, PlainObject
|
||||||
|
// is DeviceMatrix itself (already owning).
|
||||||
|
typedef typename Dest::PlainObject VectorType;
|
||||||
|
|
||||||
RealScalar tol = tol_error;
|
RealScalar tol = tol_error;
|
||||||
Index maxIters = iters;
|
Index maxIters = iters;
|
||||||
|
|||||||
@@ -43,3 +43,10 @@ add_subdirectory(Householder)
|
|||||||
add_subdirectory(Solvers)
|
add_subdirectory(Solvers)
|
||||||
add_subdirectory(Tuning)
|
add_subdirectory(Tuning)
|
||||||
add_subdirectory(BLAS)
|
add_subdirectory(BLAS)
|
||||||
|
|
||||||
|
# GPU benchmarks have their own CMake project (needs CUDAToolkit).
|
||||||
|
# They can also be built standalone: cmake -B build -S benchmarks/GPU
|
||||||
|
find_package(CUDAToolkit QUIET)
|
||||||
|
if(CUDAToolkit_FOUND)
|
||||||
|
add_subdirectory(GPU)
|
||||||
|
endif()
|
||||||
|
|||||||
91
benchmarks/GPU/CMakeLists.txt
Normal file
91
benchmarks/GPU/CMakeLists.txt
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
# GPU benchmarks require CUDA runtime + cuSOLVER.
|
||||||
|
# Build separately from the main benchmark tree since they need CUDA toolchain.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# cmake -G Ninja -B build-bench-gpu -S benchmarks/GPU \
|
||||||
|
# -DCMAKE_CUDA_ARCHITECTURES=89
|
||||||
|
# cmake --build build-bench-gpu
|
||||||
|
#
|
||||||
|
# Profiling:
|
||||||
|
# nsys profile --trace=cuda ./build-bench-gpu/bench_gpu_solvers
|
||||||
|
# ncu --set full -o profile ./build-bench-gpu/bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096
|
||||||
|
|
||||||
|
cmake_minimum_required(VERSION 3.18)
|
||||||
|
project(EigenGpuBenchmarks CXX CUDA)
|
||||||
|
|
||||||
|
find_package(benchmark REQUIRED)
|
||||||
|
find_package(CUDAToolkit REQUIRED)
|
||||||
|
|
||||||
|
set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..")
|
||||||
|
|
||||||
|
# eigen_add_gpu_benchmark(<name> <source> [LIBRARIES <lib>...] [DEFINITIONS <def>...])
#
# Adds the benchmark executable <name> built from <source>. A relative <source>
# is resolved against CMAKE_CURRENT_SOURCE_DIR. Every target gets the Eigen
# source tree and the CUDA toolkit headers on its include path, links against
# Google Benchmark (with its main) plus cudart/cuSOLVER/cuBLAS, and is built
# with -O3, NDEBUG and EIGEN_USE_GPU.
#
#   LIBRARIES    extra link targets (e.g. CUDA::cufft)
#   DEFINITIONS  extra preprocessor definitions (e.g. SCALAR=float)
function(eigen_add_gpu_benchmark name source)
  cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN})

  if(NOT IS_ABSOLUTE "${source}")
    set(source "${CMAKE_CURRENT_SOURCE_DIR}/${source}")
  endif()

  add_executable(${name} ${source})

  target_include_directories(${name} PRIVATE
    ${EIGEN_SOURCE_DIR}
    ${CUDAToolkit_INCLUDE_DIRS})

  # BENCH_LIBRARIES expands to nothing when no LIBRARIES were given.
  target_link_libraries(${name} PRIVATE
    benchmark::benchmark benchmark::benchmark_main
    CUDA::cudart CUDA::cusolver CUDA::cublas
    ${BENCH_LIBRARIES})

  target_compile_options(${name} PRIVATE -O3)

  # Preprocessor symbols go through target_compile_definitions rather than a
  # raw -D compile option; BENCH_DEFINITIONS may be empty.
  target_compile_definitions(${name} PRIVATE
    NDEBUG
    EIGEN_USE_GPU
    ${BENCH_DEFINITIONS})
endfunction()
|
||||||
|
|
||||||
|
# Solver benchmarks: LLT/LU compute + solve, host vs device paths, CPU baselines.
|
||||||
|
eigen_add_gpu_benchmark(bench_gpu_solvers bench_gpu_solvers.cpp)
|
||||||
|
eigen_add_gpu_benchmark(bench_gpu_solvers_float bench_gpu_solvers.cpp DEFINITIONS SCALAR=float)
|
||||||
|
|
||||||
|
# Chaining benchmarks: async pipeline efficiency, host-roundtrip vs device chain.
|
||||||
|
eigen_add_gpu_benchmark(bench_gpu_chaining bench_gpu_chaining.cpp)
|
||||||
|
eigen_add_gpu_benchmark(bench_gpu_chaining_float bench_gpu_chaining.cpp DEFINITIONS SCALAR=float)
|
||||||
|
|
||||||
|
# Batching benchmarks: multi-stream concurrency for many small systems.
|
||||||
|
eigen_add_gpu_benchmark(bench_gpu_batching bench_gpu_batching.cpp)
|
||||||
|
eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITIONS SCALAR=float)
|
||||||
|
|
||||||
|
# FFT benchmarks: 1D/2D C2C, R2C, C2R throughput and plan reuse.
|
||||||
|
eigen_add_gpu_benchmark(bench_gpu_fft bench_gpu_fft.cpp LIBRARIES CUDA::cufft)
|
||||||
|
eigen_add_gpu_benchmark(bench_gpu_fft_double bench_gpu_fft.cpp LIBRARIES CUDA::cufft DEFINITIONS SCALAR=double)
|
||||||
|
|
||||||
|
# CG sync overhead benchmark: host vs device pointer mode for reductions.
|
||||||
|
# Uses CUDA kernels for device scalar arithmetic.
|
||||||
|
add_executable(bench_gpu_cg_sync bench_gpu_cg_sync.cu)
|
||||||
|
target_include_directories(bench_gpu_cg_sync PRIVATE
|
||||||
|
${EIGEN_SOURCE_DIR}
|
||||||
|
${CUDAToolkit_INCLUDE_DIRS})
|
||||||
|
target_link_libraries(bench_gpu_cg_sync PRIVATE
|
||||||
|
benchmark::benchmark benchmark::benchmark_main
|
||||||
|
CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
|
||||||
|
target_compile_options(bench_gpu_cg_sync PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
|
||||||
|
target_compile_definitions(bench_gpu_cg_sync PRIVATE EIGEN_USE_GPU)
|
||||||
|
|
||||||
|
# GPU CG vs CPU CG comparison benchmark.
|
||||||
|
add_executable(bench_gpu_cg_vs_cpu bench_gpu_cg_vs_cpu.cu)
|
||||||
|
target_include_directories(bench_gpu_cg_vs_cpu PRIVATE
|
||||||
|
${EIGEN_SOURCE_DIR}
|
||||||
|
${CUDAToolkit_INCLUDE_DIRS})
|
||||||
|
target_link_libraries(bench_gpu_cg_vs_cpu PRIVATE
|
||||||
|
benchmark::benchmark benchmark::benchmark_main
|
||||||
|
CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
|
||||||
|
target_compile_options(bench_gpu_cg_vs_cpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
|
||||||
|
target_compile_definitions(bench_gpu_cg_vs_cpu PRIVATE EIGEN_USE_GPU)
|
||||||
|
|
||||||
|
# Bundle Adjustment benchmark: GPU CG vs CPU CG on real BAL datasets.
|
||||||
|
add_executable(bench_gpu_ba bench_gpu_ba.cu)
|
||||||
|
target_include_directories(bench_gpu_ba PRIVATE
|
||||||
|
${EIGEN_SOURCE_DIR}
|
||||||
|
${CUDAToolkit_INCLUDE_DIRS})
|
||||||
|
target_link_libraries(bench_gpu_ba PRIVATE
|
||||||
|
benchmark::benchmark
|
||||||
|
CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
|
||||||
|
target_compile_options(bench_gpu_ba PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
|
||||||
|
target_compile_definitions(bench_gpu_ba PRIVATE EIGEN_USE_GPU)
|
||||||
149
benchmarks/GPU/ba_results.md
Normal file
149
benchmarks/GPU/ba_results.md
Normal file
@@ -0,0 +1,149 @@
|
|||||||
|
# Bundle Adjustment: GPU CG vs CPU CG Results
|
||||||
|
|
||||||
|
Benchmark of Eigen's GPU CG pipeline on normal equations arising from bundle
|
||||||
|
adjustment (BAL datasets). Compares CPU `ConjugateGradient` (Jacobi preconditioner)
|
||||||
|
against GPU CG using `DeviceMatrix` + `GpuSparseContext` + `DeviceScalar`.
|
||||||
|
|
||||||
|
## Hardware
|
||||||
|
|
||||||
|
- **CPU**: Intel Core i7-13700HX (Raptor Lake, 12 cores / 24 threads, single thread for Eigen CG)
|
||||||
|
- **GPU**: NVIDIA GeForce RTX 4070 Laptop GPU (Ada Lovelace, 4608 CUDA cores, 8 GB GDDR6)
|
||||||
|
- **CUDA**: 13.2 / Driver 595.79
|
||||||
|
- **OS**: Ubuntu 24.04 (WSL2, kernel 6.6.87)
|
||||||
|
|
||||||
|
## Software
|
||||||
|
|
||||||
|
- Eigen: `eigen-gpu-cg` branch
|
||||||
|
- Google Benchmark 1.9.1
|
||||||
|
- Compiler: nvcc 13.2 + g++ 13.3
|
||||||
|
- Normal equations: H = J^T*J + I (Levenberg-Marquardt damping lambda=1.0)
|
||||||
|
- CG tolerance: 1e-8, max iterations: 10000
|
||||||
|
|
||||||
|
## Method
|
||||||
|
|
||||||
|
For each BAL problem file:
|
||||||
|
1. Parse the BAL file (cameras, 3D points, 2D observations)
|
||||||
|
2. Compute the full Jacobian J using the BAL camera model (Rodrigues rotation +
|
||||||
|
perspective projection + radial distortion) with central finite differences
|
||||||
|
3. Form the normal equations H = J^T*J + lambda*I (sparse, symmetric positive definite)
|
||||||
|
4. Solve H*dx = -J^T*r using CG with Jacobi preconditioner on CPU and GPU
|
||||||
|
5. Report wall-clock time (mean of 3 repetitions)
|
||||||
|
|
||||||
|
GPU CG uses: `GpuSparseContext` for SpMV, `DeviceMatrix` for vectors,
|
||||||
|
`DeviceScalar` with `CUBLAS_POINTER_MODE_DEVICE` for dot/norm reductions,
|
||||||
|
in-place `cwiseProduct` via NPP for Jacobi preconditioner application,
|
||||||
|
device-pointer-mode `scal` to avoid host sync on the beta update.
|
||||||
|
|
||||||
|
## Results
|
||||||
|
|
||||||
|
### Summary table
|
||||||
|
|
||||||
|
| Dataset | Cameras | Points | Obs | H size | H nnz | CG iters | CPU CG (ms) | GPU CG (ms) | Speedup |
|
||||||
|
|---------|---------|--------|-----|--------|-------|----------|-------------|-------------|---------|
|
||||||
|
| Ladybug-49 | 49 | 7,776 | 31,843 | 23,769 | 1.8M | 4,421 | 4,006 | 1,152 | **3.5x** |
|
||||||
|
| Ladybug-138 | 138 | 19,878 | 85,217 | 60,876 | 4.8M | 7,008 | 21,498 | 3,553 | **6.1x** |
|
||||||
|
| Ladybug-646 | 646 | 73,584 | 327,297 | 226,566 | 18.4M | 10,000* | 123,727 | 14,268 | **8.7x** |
|
||||||
|
| Dubrovnik-356 | 356 | 226,730 | 1,255,268 | 683,394 | 69.8M | 4,308 | 216,149 | 24,493 | **8.8x** |
|
||||||
|
|
||||||
|
\* Hit 10,000 iteration cap (poorly conditioned problem). Both CPU and GPU
|
||||||
|
hit the same cap, so timing comparison remains valid.
|
||||||
|
|
||||||
|
### Profile breakdown (Ladybug-138, nsys)
|
||||||
|
|
||||||
|
GPU kernel time is dominated by SpMV (91%). The remaining 9% is BLAS-1
|
||||||
|
operations (dot, axpy, scal) and NPP element-wise ops (cwiseProduct).
|
||||||
|
|
||||||
|
| Kernel | Time (ms) | % | Calls |
|
||||||
|
|--------|-----------|---|-------|
|
||||||
|
| cuSPARSE csrmv (SpMV) | 2507 | 91.3% | 7,006 |
|
||||||
|
| cuBLAS dot | 92 | 3.4% | 21,020 |
|
||||||
|
| cuBLAS axpy (device ptr) | 27 | 1.0% | 14,012 |
|
||||||
|
| cuSPARSE partition | 19 | 0.7% | 7,006 |
|
||||||
|
| NPP cwiseProduct | 16 + 13 | 1.1% | 14,011 + 7,006 |
|
||||||
|
| cuBLAS axpy (host ptr) | 12 | 0.5% | 7,005 |
|
||||||
|
| cuBLAS scal (device ptr) | 11 | 0.4% | 7,005 |
|
||||||
|
| NPP scalar ops | 7 | 0.2% | 7,006 |
|
||||||
|
|
||||||
|
### Optimizations applied
|
||||||
|
|
||||||
|
Three profiling-driven optimizations reduced GPU CG time by **1.8x**
|
||||||
|
(6.5s → 3.6s on Ladybug-138):
|
||||||
|
|
||||||
|
1. **In-place `cwiseProduct`**: The Jacobi preconditioner apply
|
||||||
|
(`z = invdiag .* residual`) was allocating a new DeviceMatrix every
|
||||||
|
iteration. Added `z.cwiseProduct(ctx, a, b)` that reuses `z`'s buffer.
|
||||||
|
Reduced `cudaMalloc` calls from 7,053 to 23 (saving 2.3s).
|
||||||
|
|
||||||
|
2. **`squaredNorm` via `dot(x,x)`**: cuBLAS `nrm2` uses a numerically
|
||||||
|
careful scaled-sum-of-squares algorithm (29µs/call). Replaced with
|
||||||
|
`dot(x,x)` (6.4µs/call) — 4.5x faster per call, saving ~320ms.
|
||||||
|
|
||||||
|
3. **Device-pointer `scal`**: `p *= beta` was converting `DeviceScalar`
|
||||||
|
beta to host (triggering a stream sync), then calling host-pointer-mode
|
||||||
|
scal. Added `operator*=(DeviceScalar)` that uses device-pointer-mode
|
||||||
|
scal, eliminating one sync per iteration. Halved `cudaStreamSynchronize`
|
||||||
|
calls from 14K to 7K.
|
||||||
|
|
||||||
|
### Observations
|
||||||
|
|
||||||
|
1. **GPU speedup scales with problem size**: from 3.5x on small problems
|
||||||
|
(24K variables) to 8.8x on large problems (683K variables). This is
|
||||||
|
expected — larger problems have more parallelism for the GPU to exploit.
|
||||||
|
|
||||||
|
2. **Iteration counts match**: CPU and GPU CG converge in the same number
|
||||||
|
of iterations (within 1%), confirming numerical equivalence.
|
||||||
|
|
||||||
|
3. **Bottleneck is SpMV**: CG iteration time is dominated (91%) by the
|
||||||
|
sparse matrix-vector product on H. Further speedup requires either
|
||||||
|
faster SpMV (e.g., block-sparse formats) or algorithmic improvements
|
||||||
|
(Schur complement, better preconditioners).
|
||||||
|
|
||||||
|
4. **Remaining overhead**: CUDA API calls (cudaMemcpyAsync for 8-byte
|
||||||
|
DeviceScalar transfers) account for ~50% of non-kernel time. Batching
|
||||||
|
multiple scalar reductions into a single transfer would help.
|
||||||
|
|
||||||
|
5. **Jacobi preconditioner is weak for BA**: The Ladybug-646 problem does
|
||||||
|
not converge in 10K iterations. Ceres uses block Jacobi or Schur
|
||||||
|
complement preconditioners that would also benefit from GPU acceleration.
|
||||||
|
|
||||||
|
### Scaling plot data
|
||||||
|
|
||||||
|
```
|
||||||
|
# n nnz_H cpu_ms gpu_ms speedup
|
||||||
|
23769 1793475 4006 1152 3.48
|
||||||
|
60876 4791762 21498 3553 6.05
|
||||||
|
226566 18387948 123727 14268 8.67
|
||||||
|
683394 69827066 216149 24493 8.82
|
||||||
|
```
|
||||||
|
|
||||||
|
## BAL datasets
|
||||||
|
|
||||||
|
Downloaded from http://grail.cs.washington.edu/projects/bal/
|
||||||
|
|
||||||
|
| File | Source |
|
||||||
|
|------|--------|
|
||||||
|
| problem-49-7776-pre.txt | Ladybug sequence |
|
||||||
|
| problem-138-19878-pre.txt | Ladybug sequence |
|
||||||
|
| problem-646-73584-pre.txt | Ladybug sequence |
|
||||||
|
| problem-356-226730-pre.txt | Dubrovnik reconstruction |
|
||||||
|
|
||||||
|
## Reproducing
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Build
|
||||||
|
cmake -G Ninja -B build-bench-gpu -S benchmarks/GPU -DCMAKE_CUDA_ARCHITECTURES=89
|
||||||
|
cmake --build build-bench-gpu --target bench_gpu_ba
|
||||||
|
|
||||||
|
# Download BAL datasets
|
||||||
|
wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-49-7776-pre.txt.bz2
|
||||||
|
wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-138-19878-pre.txt.bz2
|
||||||
|
wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-646-73584-pre.txt.bz2
|
||||||
|
wget http://grail.cs.washington.edu/projects/bal/data/dubrovnik/problem-356-226730-pre.txt.bz2
|
||||||
|
bunzip2 *.bz2
|
||||||
|
|
||||||
|
# Run (one at a time)
|
||||||
|
BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
|
||||||
|
BAL_FILE=problem-138-19878-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
|
||||||
|
BAL_FILE=problem-646-73584-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
|
||||||
|
BAL_FILE=problem-356-226730-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
|
||||||
|
```
|
||||||
533
benchmarks/GPU/bench_gpu_ba.cu
Normal file
533
benchmarks/GPU/bench_gpu_ba.cu
Normal file
@@ -0,0 +1,533 @@
|
|||||||
|
// Bundle Adjustment benchmark: GPU CG vs CPU CG on real BAL datasets.
|
||||||
|
//
|
||||||
|
// Tests Eigen's GPU CG pipeline (DeviceMatrix + GpuSparseContext + DeviceScalar)
|
||||||
|
// on the normal equations (J^T*J) arising from bundle adjustment problems.
|
||||||
|
//
|
||||||
|
// Reads a BAL (Bundle Adjustment in the Large) format file, computes the
|
||||||
|
// Jacobian and residual, forms the normal equations H = J^T*J + lambda*I,
|
||||||
|
// then solves H*dx = -J^T*r with both CPU and GPU conjugate gradients.
|
||||||
|
//
|
||||||
|
// BAL format: http://grail.cs.washington.edu/projects/bal/
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// cmake --build build-bench-gpu --target bench_gpu_ba
|
||||||
|
//
|
||||||
|
// # Download a BAL dataset (bz2-compressed):
|
||||||
|
// wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-49-7776-pre.txt.bz2
|
||||||
|
// bunzip2 problem-49-7776-pre.txt.bz2
|
||||||
|
//
|
||||||
|
// # Run on a specific problem:
|
||||||
|
// BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba
|
||||||
|
//
|
||||||
|
// # Append results to the log:
|
||||||
|
// BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba \
|
||||||
|
// --benchmark_format=console 2>&1 | tee -a benchmarks/GPU/ba_results.log
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/IterativeLinearSolvers>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <fstream>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// BAL problem data
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// One bundle-adjustment problem read from a BAL text file: a header with the
// camera / point / observation counts, then one record per observation, then
// the camera parameter blocks, then the 3D points.
struct BALProblem {
  int num_cameras = 0;
  int num_points = 0;
  int num_observations = 0;

  // Observations: (camera_idx, point_idx, observed_x, observed_y).
  std::vector<int> camera_index;
  std::vector<int> point_index;
  std::vector<double> observations_x;
  std::vector<double> observations_y;

  // Camera parameters: 9 per camera (Rodrigues r[3], translation t[3], f, k1, k2).
  std::vector<double> cameras;  // [num_cameras * 9]

  // 3D points: 3 per point.
  std::vector<double> points;  // [num_points * 3]

  // Start of the 9-value parameter block of camera i (no bounds check).
  const double* camera(int i) const { return &cameras[i * 9]; }

  // Start of the 3 coordinates of point i (no bounds check).
  const double* point(int i) const { return &points[i * 3]; }

  // Reads the problem from `filename`.
  //
  // Returns true on success. Returns false, after printing a diagnostic to
  // stderr, when the file cannot be opened, the header is missing or
  // non-positive, an observation refers to a camera or point index outside
  // the ranges announced in the header, or the file ends early. On failure
  // the object may be left partially filled.
  bool load(const std::string& filename) {
    std::ifstream in(filename);
    if (!in) {
      fprintf(stderr, "ERROR: Cannot open BAL file: %s\n", filename.c_str());
      return false;
    }

    in >> num_cameras >> num_points >> num_observations;
    if (!in || num_cameras <= 0 || num_points <= 0 || num_observations <= 0) {
      fprintf(stderr, "ERROR: Invalid BAL header in %s\n", filename.c_str());
      return false;
    }

    camera_index.resize(num_observations);
    point_index.resize(num_observations);
    observations_x.resize(num_observations);
    observations_y.resize(num_observations);
    for (int i = 0; i < num_observations; ++i) {
      in >> camera_index[i] >> point_index[i] >> observations_x[i] >> observations_y[i];

      // Validate only records that were actually read; a short file is
      // reported as truncated after all sections have been attempted.
      // Without this check a bad index is used unchecked by camera()/point().
      const bool camera_ok = camera_index[i] >= 0 && camera_index[i] < num_cameras;
      const bool point_ok = point_index[i] >= 0 && point_index[i] < num_points;
      if (in && !(camera_ok && point_ok)) {
        fprintf(stderr, "ERROR: Observation %d has out-of-range index (camera %d, point %d) in %s\n", i,
                camera_index[i], point_index[i], filename.c_str());
        return false;
      }
    }

    // Sizes are computed in size_t so large counts cannot overflow int.
    cameras.resize(static_cast<std::size_t>(num_cameras) * 9);
    for (double& value : cameras) {
      in >> value;
    }

    points.resize(static_cast<std::size_t>(num_points) * 3);
    for (double& value : points) {
      in >> value;
    }

    if (!in) {
      fprintf(stderr, "ERROR: Truncated BAL file: %s\n", filename.c_str());
      return false;
    }

    fprintf(stderr, "Loaded BAL: %d cameras, %d points, %d observations\n", num_cameras, num_points, num_observations);
    return true;
  }
};
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Camera projection model (BAL convention)
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// Rodrigues rotation: rotates the 3-vector X by the axis-angle vector omega
// (direction = rotation axis, length = angle in radians) and stores the
// rotated vector in result.
//
// omega, X and result each point to 3 doubles. result may alias X: every
// output component is computed before any of them is stored.
static void rodrigues_rotate(const double* omega, const double* X, double* result) {
  const double theta2 = omega[0] * omega[0] + omega[1] * omega[1] + omega[2] * omega[2];
  double rotated[3];

  if (theta2 > 1e-30) {
    const double theta = std::sqrt(theta2);
    const double costh = std::cos(theta);
    const double sinth = std::sin(theta);
    const double k = (1.0 - costh) / theta2;

    // Cross product omega x X.
    const double wx = omega[1] * X[2] - omega[2] * X[1];
    const double wy = omega[2] * X[0] - omega[0] * X[2];
    const double wz = omega[0] * X[1] - omega[1] * X[0];

    // Dot product omega . X.
    const double dot = omega[0] * X[0] + omega[1] * X[1] + omega[2] * X[2];

    // X*cos(theta) + (omega x X)*sin(theta)/theta + omega*(omega . X)*(1 - cos(theta))/theta^2
    rotated[0] = X[0] * costh + wx * (sinth / theta) + omega[0] * dot * k;
    rotated[1] = X[1] * costh + wy * (sinth / theta) + omega[1] * dot * k;
    rotated[2] = X[2] * costh + wz * (sinth / theta) + omega[2] * dot * k;
  } else {
    // Small angle: R ≈ I + [omega]×. The division by theta is avoided here.
    rotated[0] = X[0] + omega[1] * X[2] - omega[2] * X[1];
    rotated[1] = X[1] + omega[2] * X[0] - omega[0] * X[2];
    rotated[2] = X[2] + omega[0] * X[1] - omega[1] * X[0];
  }

  // Store only after all reads of X, so in-place calls stay correct.
  result[0] = rotated[0];
  result[1] = rotated[1];
  result[2] = rotated[2];
}
|
||||||
|
|
||||||
|
// Project a 3D point through a camera, returning the 2D residual.
|
||||||
|
// camera: [r0,r1,r2, t0,t1,t2, f, k1, k2]
|
||||||
|
// point: [X, Y, Z]
|
||||||
|
// observed: [ox, oy]
|
||||||
|
// residual: [rx, ry] = projected - observed
|
||||||
|
static void project(const double* camera, const double* point, const double* observed, double* residual) {
|
||||||
|
// Rotate.
|
||||||
|
double P[3];
|
||||||
|
rodrigues_rotate(camera, point, P);
|
||||||
|
|
||||||
|
// Translate.
|
||||||
|
P[0] += camera[3];
|
||||||
|
P[1] += camera[4];
|
||||||
|
P[2] += camera[5];
|
||||||
|
|
||||||
|
// Normalize (BAL convention: negative z).
|
||||||
|
double xp = -P[0] / P[2];
|
||||||
|
double yp = -P[1] / P[2];
|
||||||
|
|
||||||
|
// Radial distortion.
|
||||||
|
double r2 = xp * xp + yp * yp;
|
||||||
|
double distortion = 1.0 + camera[7] * r2 + camera[8] * r2 * r2;
|
||||||
|
|
||||||
|
// Apply focal length.
|
||||||
|
double predicted_x = camera[6] * distortion * xp;
|
||||||
|
double predicted_y = camera[6] * distortion * yp;
|
||||||
|
|
||||||
|
residual[0] = predicted_x - observed[0];
|
||||||
|
residual[1] = predicted_y - observed[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Jacobian computation (numerical differentiation)
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// Compute the 2x9 Jacobian block w.r.t. camera params and 2x3 block w.r.t.
|
||||||
|
// point coords for a single observation, using central finite differences.
|
||||||
|
static void compute_jacobian_block(const double* camera, const double* point, const double* observed,
|
||||||
|
double* J_cam, // 2x9, row-major
|
||||||
|
double* J_point) // 2x3, row-major
|
||||||
|
{
|
||||||
|
constexpr double eps = 1e-8;
|
||||||
|
|
||||||
|
// Camera parameters (9).
|
||||||
|
double cam_pert[9];
|
||||||
|
std::copy(camera, camera + 9, cam_pert);
|
||||||
|
for (int j = 0; j < 9; ++j) {
|
||||||
|
double orig = cam_pert[j];
|
||||||
|
double rp[2], rm[2];
|
||||||
|
|
||||||
|
cam_pert[j] = orig + eps;
|
||||||
|
project(cam_pert, point, observed, rp);
|
||||||
|
cam_pert[j] = orig - eps;
|
||||||
|
project(cam_pert, point, observed, rm);
|
||||||
|
cam_pert[j] = orig;
|
||||||
|
|
||||||
|
J_cam[0 * 9 + j] = (rp[0] - rm[0]) / (2.0 * eps);
|
||||||
|
J_cam[1 * 9 + j] = (rp[1] - rm[1]) / (2.0 * eps);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Point coordinates (3).
|
||||||
|
double pt_pert[3];
|
||||||
|
std::copy(point, point + 3, pt_pert);
|
||||||
|
for (int j = 0; j < 3; ++j) {
|
||||||
|
double orig = pt_pert[j];
|
||||||
|
double rp[2], rm[2];
|
||||||
|
|
||||||
|
pt_pert[j] = orig + eps;
|
||||||
|
project(camera, pt_pert, observed, rp);
|
||||||
|
pt_pert[j] = orig - eps;
|
||||||
|
project(camera, pt_pert, observed, rm);
|
||||||
|
pt_pert[j] = orig;
|
||||||
|
|
||||||
|
J_point[0 * 3 + j] = (rp[0] - rm[0]) / (2.0 * eps);
|
||||||
|
J_point[1 * 3 + j] = (rp[1] - rm[1]) / (2.0 * eps);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Build normal equations: H = J^T*J + lambda*I, g = -J^T*r
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
struct NormalEquations {
|
||||||
|
SparseMatrix<double, ColMajor, int> H;
|
||||||
|
VectorXd g;
|
||||||
|
VectorXd residual;
|
||||||
|
double residual_norm;
|
||||||
|
int jacobian_rows;
|
||||||
|
int jacobian_cols;
|
||||||
|
long jacobian_nnz;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Assemble the damped normal equations of a bundle-adjustment problem.
//
// For every observation the 2-vector residual (project()) and the 2x9 camera / 2x3 point
// Jacobian blocks (compute_jacobian_block()) are evaluated and scattered into a sparse
// Jacobian J. The function then forms
//   H = J^T*J + lambda*I   and   g = -J^T*r.
//
// problem: cameras, points and observations to linearize.
// lambda:  Levenberg-Marquardt damping added to every diagonal entry of H.
// Returns H, g, the residual vector, its 2-norm, and the dimensions / nonzero count of J.
static NormalEquations build_normal_equations(const BALProblem& problem, double lambda = 1.0) {
  const int num_cam_params = problem.num_cameras * 9;
  const int num_pt_params = problem.num_points * 3;
  const int num_params = num_cam_params + num_pt_params;
  const int num_residuals = problem.num_observations * 2;

  fprintf(stderr, "Building Jacobian: %d x %d, %ld nonzeros\n", num_residuals, num_params,
          (long)problem.num_observations * 24);

  // Build J as a triplet list.
  using Triplet = Eigen::Triplet<double>;
  std::vector<Triplet> triplets;
  // 2 rows x 12 nonzeros = at most 24 entries per observation. The product is formed in
  // size_t so that it cannot overflow int for very large problems.
  triplets.reserve(static_cast<std::size_t>(problem.num_observations) * 24);

  VectorXd residual(num_residuals);

  for (int obs = 0; obs < problem.num_observations; ++obs) {
    const int ci = problem.camera_index[obs];
    const int pi = problem.point_index[obs];
    double observed[2] = {problem.observations_x[obs], problem.observations_y[obs]};

    // Residual of this observation occupies rows 2*obs and 2*obs+1.
    double r[2];
    project(problem.camera(ci), problem.point(pi), observed, r);
    residual[obs * 2 + 0] = r[0];
    residual[obs * 2 + 1] = r[1];

    // Jacobian blocks, stored row-major: 2x9 (camera) and 2x3 (point).
    double J_cam[18], J_pt[6];
    compute_jacobian_block(problem.camera(ci), problem.point(pi), observed, J_cam, J_pt);

    // Append a row-major 2 x width block at column offset col0, skipping exact zeros.
    auto scatter = [&](const double* block, int width, int col0) {
      for (int row = 0; row < 2; ++row) {
        for (int col = 0; col < width; ++col) {
          const double val = block[row * width + col];
          if (val != 0.0) {
            triplets.emplace_back(obs * 2 + row, col0 + col, val);
          }
        }
      }
    };

    // Camera block: cols [9*ci, 9*ci+8]; point block: cols [num_cam_params + 3*pi, ...].
    scatter(J_cam, 9, ci * 9);
    scatter(J_pt, 3, num_cam_params + pi * 3);
  }

  // Build sparse Jacobian.
  SparseMatrix<double, ColMajor, int> J(num_residuals, num_params);
  J.setFromTriplets(triplets.begin(), triplets.end());

  fprintf(stderr, "Jacobian: %dx%d, nnz=%ld\n", (int)J.rows(), (int)J.cols(), (long)J.nonZeros());

  // Form normal equations: H = J^T*J + lambda*I.
  SparseMatrix<double, ColMajor, int> H = (J.transpose() * J).pruned();

  // Add the Levenberg-Marquardt damping as a single sparse sum. Unlike per-element
  // coeffRef() this never has to insert into an already compressed matrix when a diagonal
  // entry is absent after pruning.
  SparseMatrix<double, ColMajor, int> damping(num_params, num_params);
  damping.setIdentity();
  H += lambda * damping;
  H.makeCompressed();

  // Gradient: g = -J^T * r.
  VectorXd g = -(J.transpose() * residual);

  const double rnorm = residual.norm();
  fprintf(stderr, "Normal equations: H is %dx%d, nnz=%ld, |r|=%.6e\n", (int)H.rows(), (int)H.cols(), (long)H.nonZeros(),
          rnorm);

  return {std::move(H), std::move(g), std::move(residual), rnorm, num_residuals, num_params, (long)J.nonZeros()};
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Global problem state (loaded once before benchmarks run)
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
static BALProblem g_problem;
|
||||||
|
static NormalEquations g_neq;
|
||||||
|
static bool g_loaded = false;
|
||||||
|
|
||||||
|
static void ensure_loaded() {
|
||||||
|
if (g_loaded) return;
|
||||||
|
|
||||||
|
const char* bal_file = std::getenv("BAL_FILE");
|
||||||
|
if (!bal_file) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"ERROR: Set BAL_FILE environment variable to a BAL problem file.\n"
|
||||||
|
" Download from: http://grail.cs.washington.edu/projects/bal/\n"
|
||||||
|
" Example:\n"
|
||||||
|
" wget http://grail.cs.washington.edu/projects/bal/data/ladybug/"
|
||||||
|
"problem-49-7776-pre.txt.bz2\n"
|
||||||
|
" bunzip2 problem-49-7776-pre.txt.bz2\n"
|
||||||
|
" BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba\n");
|
||||||
|
std::exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!g_problem.load(bal_file)) {
|
||||||
|
std::exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
g_neq = build_normal_equations(g_problem);
|
||||||
|
g_loaded = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CPU CG benchmark
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// CPU baseline: Eigen's ConjugateGradient on H*dx = g WITHOUT preconditioning.
//
// ConjugateGradient uses DiagonalPreconditioner (Jacobi) when no preconditioner type is
// given, which would make this benchmark measure exactly the same solver as
// BM_BA_CPU_CG_Jacobi. IdentityPreconditioner is selected explicitly so the two differ.
// Reported counters: system size, nonzeros of H, CG iterations and relative error of the
// last solve, and the problem dimensions.
static void BM_BA_CPU_CG(benchmark::State& state) {
  ensure_loaded();
  const auto& H = g_neq.H;
  const auto& g = g_neq.g;

  ConjugateGradient<SparseMatrix<double, ColMajor, int>, Lower | Upper, IdentityPreconditioner> cg;
  cg.setMaxIterations(10000);
  cg.setTolerance(1e-8);
  cg.compute(H);  // setup is done once, outside the timed loop

  int last_iters = 0;
  double last_error = 0;
  for (auto _ : state) {
    VectorXd dx = cg.solve(g);
    benchmark::DoNotOptimize(dx.data());
    last_iters = cg.iterations();
    last_error = cg.error();
  }

  state.counters["n"] = H.rows();
  state.counters["nnz"] = H.nonZeros();
  state.counters["iters"] = last_iters;
  state.counters["error"] = last_error;
  state.counters["cameras"] = g_problem.num_cameras;
  state.counters["points"] = g_problem.num_points;
  state.counters["observations"] = g_problem.num_observations;
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// GPU CG benchmark (with Jacobi preconditioner)
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
static void cuda_warmup() {
|
||||||
|
static bool done = false;
|
||||||
|
if (!done) {
|
||||||
|
void* p;
|
||||||
|
cudaMalloc(&p, 1);
|
||||||
|
cudaFree(p);
|
||||||
|
done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// GPU benchmark: Jacobi-preconditioned conjugate gradient on H*dx = g, written with
// DeviceMatrix operations. Tolerance (1e-8) and iteration cap (10000) match the CPU
// benchmarks. Each timed iteration solves the system from a zero initial guess.
// Reported counters: system size, nonzeros of H, CG iterations and relative error of the
// last solve, and the problem dimensions.
static void BM_BA_GPU_CG(benchmark::State& state) {
  ensure_loaded();
  cuda_warmup();

  const auto& H = g_neq.H;
  const auto& g = g_neq.g;
  const Index n = H.rows();

  // Extract inverse diagonal (Jacobi preconditioner). A missing or zero diagonal entry
  // maps to 1 so the preconditioner leaves that component unchanged.
  using SpMat = SparseMatrix<double, ColMajor, int>;
  VectorXd invdiag(n);
  for (Index j = 0; j < H.outerSize(); ++j) {
    SpMat::InnerIterator it(H, j);
    while (it && it.index() != j) ++it;
    if (it && it.index() == j && it.value() != 0.0)
      invdiag(j) = 1.0 / it.value();
    else
      invdiag(j) = 1.0;
  }

  // Set up GPU context and upload data.
  GpuContext ctx;
  GpuContext::setThreadLocal(&ctx);
  GpuSparseContext<double> spmv_ctx(ctx);
  auto mat = spmv_ctx.deviceView(H);
  auto d_invdiag = DeviceMatrix<double>::fromHost(invdiag, ctx.stream());
  auto d_g = DeviceMatrix<double>::fromHost(g, ctx.stream());

  int last_iters = 0;
  double last_error = 0;

  for (auto _ : state) {
    DeviceMatrix<double> d_x(n, 1);
    d_x.setZero(ctx);
    DeviceMatrix<double> residual(n, 1);
    residual.copyFrom(ctx, d_g);

    double rhsNorm2 = d_g.squaredNorm(ctx);
    if (rhsNorm2 == 0) {
      // Zero right-hand side: x = 0 is the exact solution. Without this guard the first
      // step size is 0/0, the loop runs to the iteration cap on NaNs, and the reported
      // relative error is NaN.
      benchmark::DoNotOptimize(d_x.data());
      last_iters = 0;
      last_error = 0;
      continue;
    }
    double threshold = 1e-8 * 1e-8 * rhsNorm2;
    double residualNorm2 = residual.squaredNorm(ctx);

    DeviceMatrix<double> p = d_invdiag.cwiseProduct(ctx, residual);
    DeviceMatrix<double> z(n, 1), tmp(n, 1);

    auto absNew = residual.dot(ctx, p);
    Index i = 0;
    Index maxIters = 10000;
    while (i < maxIters) {
      tmp.noalias() = mat * p;
      auto alpha = absNew / p.dot(ctx, tmp);
      d_x += alpha * p;
      residual -= alpha * tmp;

      residualNorm2 = residual.squaredNorm(ctx);
      if (residualNorm2 < threshold) break;

      z.cwiseProduct(ctx, d_invdiag, residual);  // in-place, no allocation
      auto absOld = std::move(absNew);
      absNew = residual.dot(ctx, z);
      auto beta = absNew / absOld;

      p *= beta;  // device-pointer scal, no host sync
      p += z;
      i++;
    }
    benchmark::DoNotOptimize(d_x.data());
    last_iters = static_cast<int>(i);
    last_error = std::sqrt(residualNorm2 / rhsNorm2);
  }

  GpuContext::setThreadLocal(nullptr);

  state.counters["n"] = n;
  state.counters["nnz"] = H.nonZeros();
  state.counters["iters"] = last_iters;
  state.counters["error"] = last_error;
  state.counters["cameras"] = g_problem.num_cameras;
  state.counters["points"] = g_problem.num_points;
  state.counters["observations"] = g_problem.num_observations;
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// CPU CG with Jacobi preconditioner (apples-to-apples comparison)
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// CPU counterpart of the GPU benchmark: Eigen's ConjugateGradient with a Jacobi
// (diagonal) preconditioner on H*dx = g. The preconditioner type is spelled out so the
// configuration does not depend on ConjugateGradient's default template argument, and the
// same counters are reported as by the other bundle-adjustment benchmarks.
static void BM_BA_CPU_CG_Jacobi(benchmark::State& state) {
  ensure_loaded();
  const auto& H = g_neq.H;
  const auto& g = g_neq.g;

  // Eigen's DiagonalPreconditioner is effectively Jacobi.
  ConjugateGradient<SparseMatrix<double, ColMajor, int>, Lower | Upper, DiagonalPreconditioner<double>> cg;
  cg.setMaxIterations(10000);
  cg.setTolerance(1e-8);
  cg.compute(H);  // setup is done once, outside the timed loop

  int last_iters = 0;
  double last_error = 0;
  for (auto _ : state) {
    VectorXd dx = cg.solve(g);
    benchmark::DoNotOptimize(dx.data());
    last_iters = cg.iterations();
    last_error = cg.error();
  }

  state.counters["n"] = H.rows();
  state.counters["nnz"] = H.nonZeros();
  state.counters["iters"] = last_iters;
  state.counters["error"] = last_error;
  state.counters["cameras"] = g_problem.num_cameras;
  state.counters["points"] = g_problem.num_points;
  state.counters["observations"] = g_problem.num_observations;
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Register benchmarks
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// All three bundle-adjustment benchmarks report their timings in milliseconds.
BENCHMARK(BM_BA_CPU_CG)
    ->Unit(benchmark::kMillisecond);
BENCHMARK(BM_BA_CPU_CG_Jacobi)
    ->Unit(benchmark::kMillisecond);
BENCHMARK(BM_BA_GPU_CG)
    ->Unit(benchmark::kMillisecond);
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Custom main: print summary after benchmarks
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
benchmark::Initialize(&argc, argv);
|
||||||
|
|
||||||
|
// Print problem info before benchmarks.
|
||||||
|
const char* bal_file = std::getenv("BAL_FILE");
|
||||||
|
if (bal_file) {
|
||||||
|
ensure_loaded();
|
||||||
|
fprintf(stderr,
|
||||||
|
"\n"
|
||||||
|
"=== Bundle Adjustment GPU CG Benchmark ===\n"
|
||||||
|
"BAL file: %s\n"
|
||||||
|
"Cameras: %d\n"
|
||||||
|
"Points: %d\n"
|
||||||
|
"Observations: %d\n"
|
||||||
|
"J size: %d x %d, nnz=%ld\n"
|
||||||
|
"H size: %d x %d, nnz=%ld\n"
|
||||||
|
"|residual|: %.6e\n"
|
||||||
|
"==========================================\n\n",
|
||||||
|
bal_file, g_problem.num_cameras, g_problem.num_points, g_problem.num_observations, g_neq.jacobian_rows,
|
||||||
|
g_neq.jacobian_cols, g_neq.jacobian_nnz, (int)g_neq.H.rows(), (int)g_neq.H.cols(), (long)g_neq.H.nonZeros(),
|
||||||
|
g_neq.residual_norm);
|
||||||
|
}
|
||||||
|
|
||||||
|
benchmark::RunSpecifiedBenchmarks();
|
||||||
|
benchmark::Shutdown();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
268
benchmarks/GPU/bench_gpu_batching.cpp
Normal file
268
benchmarks/GPU/bench_gpu_batching.cpp
Normal file
@@ -0,0 +1,268 @@
|
|||||||
|
// GPU batching benchmarks: multi-stream concurrency for many small solves.
|
||||||
|
//
|
||||||
|
// Each GpuLLT/GpuLU owns its own CUDA stream. This benchmark measures how
|
||||||
|
// well multiple solver instances overlap on the GPU, which is critical for
|
||||||
|
// workloads like robotics (many small systems) and SLAM (batched poses).
|
||||||
|
//
|
||||||
|
// Compares:
|
||||||
|
// 1. Sequential: one solver handles all systems one by one
|
||||||
|
// 2. Batched: N solvers on N streams, all launched before any sync
|
||||||
|
// 3. CPU baseline: Eigen LLT on host
|
||||||
|
//
|
||||||
|
// For Nsight Systems: batched mode should show overlapping kernels on
|
||||||
|
// different streams in the timeline view.
|
||||||
|
//
|
||||||
|
// nsys profile --trace=cuda ./bench_gpu_batching
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
|
||||||
|
#include <Eigen/Cholesky>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
#ifndef SCALAR
|
||||||
|
#define SCALAR double
|
||||||
|
#endif
|
||||||
|
|
||||||
|
using Scalar = SCALAR;
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
|
||||||
|
// Build a random n x n symmetric positive definite matrix as R^H * R + n * I,
// where R has random entries.
static Mat make_spd(Index n) {
  const Mat R = Mat::Random(n, n);
  const Scalar shift = static_cast<Scalar>(n);
  Mat spd = R.adjoint() * R + Mat::Identity(n, n) * shift;
  return spd;
}
|
||||||
|
|
||||||
|
static void cuda_warmup() {
|
||||||
|
static bool done = false;
|
||||||
|
if (!done) {
|
||||||
|
void* p;
|
||||||
|
cudaMalloc(&p, 1);
|
||||||
|
cudaFree(p);
|
||||||
|
done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// Sequential: one solver, N systems solved one after another
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// A single GpuLLT instance factors and solves every system of the batch in turn, taking
// host matrices as input on each call.
// Args: range(0) = matrix dimension n, range(1) = number of systems per batch.
static void BM_Batch_Sequential(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int batch_size = static_cast<int>(state.range(1));

  // Generate every SPD matrix and right-hand side before timing starts.
  std::vector<Mat> As;
  std::vector<Mat> Bs;
  As.reserve(batch_size);
  Bs.reserve(batch_size);
  for (int k = 0; k < batch_size; ++k) {
    As.push_back(make_spd(n));
    Bs.push_back(Mat::Random(n, 1));
  }

  GpuLLT<Scalar> llt;

  for (auto _ : state) {
    std::vector<Mat> results(batch_size);
    int k = 0;
    while (k < batch_size) {
      llt.compute(As[k]);
      results[k] = llt.solve(Bs[k]);
      ++k;
    }
    benchmark::DoNotOptimize(results.back().data());
  }

  state.counters["n"] = n;
  state.counters["batch"] = batch_size;
  state.counters["total_solves"] = batch_size;
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// Sequential with DeviceMatrix (avoid re-upload of A each iteration)
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Like BM_Batch_Sequential, but the inputs are uploaded to DeviceMatrix objects once
// before timing; the timed loop factors, solves and downloads each result in turn.
// Args: range(0) = matrix dimension n, range(1) = number of systems per batch.
static void BM_Batch_Sequential_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int batch_size = static_cast<int>(state.range(1));

  std::vector<Mat> As(batch_size);
  std::vector<Mat> Bs(batch_size);
  std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
  std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
  for (int k = 0; k < batch_size; ++k) {
    // Host data first (same generation order for every k), then its device copy.
    As[k] = make_spd(n);
    Bs[k] = Mat::Random(n, 1);
    d_As[k] = DeviceMatrix<Scalar>::fromHost(As[k]);
    d_Bs[k] = DeviceMatrix<Scalar>::fromHost(Bs[k]);
  }

  GpuLLT<Scalar> llt;

  for (auto _ : state) {
    std::vector<Mat> results(batch_size);
    int k = 0;
    while (k < batch_size) {
      llt.compute(d_As[k]);
      DeviceMatrix<Scalar> d_X = llt.solve(d_Bs[k]);
      results[k] = d_X.toHost();
      ++k;
    }
    benchmark::DoNotOptimize(results.back().data());
  }

  state.counters["n"] = n;
  state.counters["batch"] = batch_size;
  state.counters["total_solves"] = batch_size;
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// Batched: N solvers on N streams, overlapping execution
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// One GpuLLT instance per system. The timed loop issues all factorizations, then all
// solves, and only afterwards downloads the results.
// Args: range(0) = matrix dimension n, range(1) = number of systems (and solvers).
static void BM_Batch_MultiStream(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int batch_size = static_cast<int>(state.range(1));

  std::vector<Mat> As(batch_size);
  std::vector<Mat> Bs(batch_size);
  std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
  std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
  for (int k = 0; k < batch_size; ++k) {
    As[k] = make_spd(n);
    Bs[k] = Mat::Random(n, 1);
    d_As[k] = DeviceMatrix<Scalar>::fromHost(As[k]);
    d_Bs[k] = DeviceMatrix<Scalar>::fromHost(Bs[k]);
  }

  // N solvers = N independent CUDA streams.
  std::vector<std::unique_ptr<GpuLLT<Scalar>>> solvers;
  solvers.reserve(batch_size);
  for (int k = 0; k < batch_size; ++k) {
    solvers.push_back(std::make_unique<GpuLLT<Scalar>>());
  }

  for (auto _ : state) {
    // Phase 1: launch all factorizations (async, different streams).
    for (int k = 0; k < batch_size; ++k) {
      solvers[k]->compute(d_As[k]);
    }

    // Phase 2: launch all solves (async, different streams).
    std::vector<DeviceMatrix<Scalar>> d_Xs(batch_size);
    for (int k = 0; k < batch_size; ++k) {
      d_Xs[k] = solvers[k]->solve(d_Bs[k]);
    }

    // Phase 3: download all results.
    std::vector<Mat> results(batch_size);
    for (int k = 0; k < batch_size; ++k) {
      results[k] = d_Xs[k].toHost();
    }
    benchmark::DoNotOptimize(results.back().data());
  }

  state.counters["n"] = n;
  state.counters["batch"] = batch_size;
  state.counters["streams"] = batch_size;
  state.counters["total_solves"] = batch_size;
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// Batched with async download (overlap D2H with computation)
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// One GpuLLT instance per system, with results fetched through toHostAsync(): all
// downloads are enqueued first and only then collected with get().
// Args: range(0) = matrix dimension n, range(1) = number of systems (and solvers).
static void BM_Batch_MultiStream_AsyncDownload(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int batch_size = static_cast<int>(state.range(1));

  std::vector<Mat> As(batch_size);
  std::vector<Mat> Bs(batch_size);
  std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
  std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
  for (int k = 0; k < batch_size; ++k) {
    As[k] = make_spd(n);
    Bs[k] = Mat::Random(n, 1);
    d_As[k] = DeviceMatrix<Scalar>::fromHost(As[k]);
    d_Bs[k] = DeviceMatrix<Scalar>::fromHost(Bs[k]);
  }

  std::vector<std::unique_ptr<GpuLLT<Scalar>>> solvers;
  solvers.reserve(batch_size);
  for (int k = 0; k < batch_size; ++k) {
    solvers.push_back(std::make_unique<GpuLLT<Scalar>>());
  }

  for (auto _ : state) {
    // Launch all compute + solve.
    std::vector<DeviceMatrix<Scalar>> d_Xs(batch_size);
    for (int k = 0; k < batch_size; ++k) {
      auto& solver = *solvers[k];
      solver.compute(d_As[k]);
      d_Xs[k] = solver.solve(d_Bs[k]);
    }

    // Enqueue all async downloads.
    std::vector<HostTransfer<Scalar>> transfers;
    transfers.reserve(batch_size);
    for (int k = 0; k < batch_size; ++k) {
      transfers.push_back(d_Xs[k].toHostAsync());
    }

    // Collect all results.
    for (int k = 0; k < batch_size; ++k) {
      benchmark::DoNotOptimize(transfers[k].get().data());
    }
  }

  state.counters["n"] = n;
  state.counters["batch"] = batch_size;
  state.counters["streams"] = batch_size;
  state.counters["total_solves"] = batch_size;
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// CPU baseline: Eigen LLT on host, sequential
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Host reference: factor and solve every system of the batch with Eigen's LLT, one
// after another.
// Args: range(0) = matrix dimension n, range(1) = number of systems per batch.
static void BM_Batch_CPU(benchmark::State& state) {
  const Index n = state.range(0);
  const int batch_size = static_cast<int>(state.range(1));

  // Inputs are generated before timing starts.
  std::vector<Mat> As;
  std::vector<Mat> Bs;
  As.reserve(batch_size);
  Bs.reserve(batch_size);
  for (int k = 0; k < batch_size; ++k) {
    As.push_back(make_spd(n));
    Bs.push_back(Mat::Random(n, 1));
  }

  for (auto _ : state) {
    std::vector<Mat> results(batch_size);
    for (int k = 0; k < batch_size; ++k) {
      // A fresh factorization per system, as in the GPU variants.
      const LLT<Mat> factorization(As[k]);
      results[k] = factorization.solve(Bs[k]);
    }
    benchmark::DoNotOptimize(results.back().data());
  }

  state.counters["n"] = n;
  state.counters["batch"] = batch_size;
  state.counters["total_solves"] = batch_size;
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// Registration
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
|
// Args: {matrix_size, batch_size}
|
||||||
|
// Small matrices with large batches are the interesting case for multi-stream.
|
||||||
|
// Every variant is run over the full product of small matrix sizes and batch sizes.
BENCHMARK(BM_Batch_Sequential)
    ->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_Sequential_Device)
    ->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_MultiStream)
    ->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_MultiStream_AsyncDownload)
    ->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_CPU)
    ->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})
    ->Unit(benchmark::kMicrosecond);

// Also run larger sizes with moderate batching (multi-stream variants only).
BENCHMARK(BM_Batch_MultiStream)
    ->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Batch_MultiStream_AsyncDownload)
    ->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})
    ->Unit(benchmark::kMicrosecond);
|
||||||
|
// clang-format on
|
||||||
291
benchmarks/GPU/bench_gpu_cg_sync.cu
Normal file
291
benchmarks/GPU/bench_gpu_cg_sync.cu
Normal file
@@ -0,0 +1,291 @@
|
|||||||
|
// Benchmark: GPU Conjugate Gradient via DeviceMatrix operators.
|
||||||
|
//
|
||||||
|
// Shows the path to running Eigen's CG on GPU with minimal code changes.
|
||||||
|
// The DeviceMatrix benchmark mirrors Eigen's conjugate_gradient() line-by-line.
|
||||||
|
// A raw cuBLAS device-pointer-mode implementation is included as a lower bound.
|
||||||
|
//
|
||||||
|
// The only change needed in Eigen's CG template to support DeviceMatrix:
|
||||||
|
// Line 34: typedef Dest VectorType; (instead of Matrix<Scalar, Dynamic, 1>)
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// cmake --build build-bench-gpu --target bench_gpu_cg_sync
|
||||||
|
// ./build-bench-gpu/bench_gpu_cg_sync
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
#include <cusparse.h>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
using Scalar = double;
|
||||||
|
using RealScalar = double;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
|
||||||
|
// Build the n x n tridiagonal test matrix with 4 on the diagonal and -1 on the first
// sub- and super-diagonal, returned in compressed form.
static SpMat make_spd(Index n) {
  SpMat A(n, n);
  A.reserve(VectorXi::Constant(n, 3));  // at most three entries per column
  // The matrix is symmetric, so filling it column by column (rows in ascending order)
  // yields the same entries as filling it row by row.
  for (Index col = 0; col < n; ++col) {
    if (col > 0) A.insert(col - 1, col) = -1.0;
    A.insert(col, col) = 4.0;
    if (col < n - 1) A.insert(col + 1, col) = -1.0;
  }
  A.makeCompressed();
  return A;
}
|
||||||
|
|
||||||
|
static void cuda_warmup() {
|
||||||
|
static bool done = false;
|
||||||
|
if (!done) {
|
||||||
|
void* p;
|
||||||
|
cudaMalloc(&p, 1);
|
||||||
|
cudaFree(p);
|
||||||
|
done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ==========================================================================
|
||||||
|
// GPU CG using DeviceMatrix operators — mirrors Eigen's conjugate_gradient()
|
||||||
|
// ==========================================================================
|
||||||
|
//
|
||||||
|
// Compare with Eigen/src/IterativeLinearSolvers/ConjugateGradient.h lines 29-84.
|
||||||
|
// Left column: Eigen CG code. Right column: this benchmark.
|
||||||
|
//
|
||||||
|
// Eigen CG GPU CG (this benchmark)
|
||||||
|
// -------- -----------------------
|
||||||
|
// VectorType residual = rhs - mat * x; residual.copyFrom(ctx, rhs); [x=0 so r=b]
|
||||||
|
// RealScalar rhsNorm2 = rhs.sqNorm(); RealScalar rhsNorm2 = rhs.squaredNorm();
|
||||||
|
// ...
|
||||||
|
// tmp.noalias() = mat * p; tmp.noalias() = mat * p; [identical]
|
||||||
|
// Scalar alpha = absNew / p.dot(tmp); Scalar alpha = absNew / p.dot(tmp); [identical]
|
||||||
|
// x += alpha * p; x += alpha * p; [identical]
|
||||||
|
// residual -= alpha * tmp; residual -= alpha * tmp; [identical]
|
||||||
|
// residualNorm2 = residual.sqNorm(); residualNorm2 = residual.squaredNorm(); [identical]
|
||||||
|
// ...
|
||||||
|
// p = z + beta * p; p *= beta; p += z; [equivalent, no alloc]
|
||||||
|
|
||||||
|
static void BM_CG_DeviceMatrixOps(benchmark::State& state) {
|
||||||
|
cuda_warmup();
|
||||||
|
const Index n = state.range(0);
|
||||||
|
|
||||||
|
SpMat A = make_spd(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
// One shared context: SpMV + BLAS-1 on same stream, zero event overhead.
|
||||||
|
GpuContext ctx;
|
||||||
|
GpuContext::setThreadLocal(&ctx);
|
||||||
|
GpuSparseContext<Scalar> spmv(ctx);
|
||||||
|
auto mat = spmv.deviceView(A);
|
||||||
|
|
||||||
|
// Upload RHS once.
|
||||||
|
auto rhs = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
// --- Eigen CG lines 34-63: initialization ---
|
||||||
|
// typedef Dest VectorType; // GPU CHANGE: was Matrix<Scalar,Dynamic,1>
|
||||||
|
// VectorType residual = rhs - mat * x; // x=0, so residual = rhs
|
||||||
|
DeviceMatrix<Scalar> x(n, 1);
|
||||||
|
x.setZero();
|
||||||
|
DeviceMatrix<Scalar> residual(n, 1);
|
||||||
|
residual.copyFrom(ctx, rhs);
|
||||||
|
|
||||||
|
// RealScalar rhsNorm2 = rhs.squaredNorm();
|
||||||
|
RealScalar rhsNorm2 = rhs.squaredNorm();
|
||||||
|
if (rhsNorm2 == 0) continue;
|
||||||
|
|
||||||
|
RealScalar tol = 1e-10;
|
||||||
|
const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
|
||||||
|
RealScalar threshold = numext::maxi(RealScalar(tol * tol * rhsNorm2), considerAsZero);
|
||||||
|
|
||||||
|
// RealScalar residualNorm2 = residual.squaredNorm();
|
||||||
|
RealScalar residualNorm2 = residual.squaredNorm();
|
||||||
|
if (residualNorm2 < threshold) continue;
|
||||||
|
|
||||||
|
// VectorType p(n);
|
||||||
|
// p = precond.solve(residual); // no preconditioner: p = residual
|
||||||
|
DeviceMatrix<Scalar> p(n, 1);
|
||||||
|
p.copyFrom(ctx, residual);
|
||||||
|
|
||||||
|
// VectorType z(n), tmp(n);
|
||||||
|
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
|
||||||
|
|
||||||
|
// auto absNew = numext::real(residual.dot(p));
|
||||||
|
// DeviceScalar — stays on device, no sync.
|
||||||
|
auto absNew = residual.dot(p); // DeviceScalar, no sync
|
||||||
|
|
||||||
|
// while (i < maxIters) {
|
||||||
|
Index maxIters = 200;
|
||||||
|
Index i = 0;
|
||||||
|
while (i < maxIters) {
|
||||||
|
// tmp.noalias() = mat * p;
|
||||||
|
tmp.noalias() = mat * p; // SpMV, device-resident
|
||||||
|
|
||||||
|
// auto alpha = absNew / p.dot(tmp);
|
||||||
|
// DeviceScalar / DeviceScalar → device kernel, no sync!
|
||||||
|
auto alpha = absNew / p.dot(tmp); // DeviceScalar, no sync
|
||||||
|
|
||||||
|
// x += alpha * p;
|
||||||
|
// DeviceScalar * DeviceMatrix → device-pointer axpy, no sync!
|
||||||
|
x += alpha * p;
|
||||||
|
|
||||||
|
// residual -= alpha * tmp;
|
||||||
|
residual -= alpha * tmp; // device-pointer axpy, no sync
|
||||||
|
|
||||||
|
// residualNorm2 = residual.squaredNorm();
|
||||||
|
residualNorm2 = residual.squaredNorm(); // THE one sync per iteration
|
||||||
|
|
||||||
|
// if (residualNorm2 < threshold) break;
|
||||||
|
if (residualNorm2 < threshold) break;
|
||||||
|
|
||||||
|
// z = precond.solve(residual);
|
||||||
|
z.copyFrom(ctx, residual); // no preconditioner
|
||||||
|
|
||||||
|
// auto absOld = std::move(absNew);
|
||||||
|
auto absOld = std::move(absNew); // no sync, no alloc
|
||||||
|
|
||||||
|
// absNew = numext::real(residual.dot(z));
|
||||||
|
absNew = residual.dot(z); // DeviceScalar, no sync
|
||||||
|
|
||||||
|
// auto beta = absNew / absOld;
|
||||||
|
// DeviceScalar / DeviceScalar → device kernel, no sync!
|
||||||
|
auto beta = absNew / absOld; // DeviceScalar, no sync
|
||||||
|
|
||||||
|
// p = z + beta * p;
|
||||||
|
p *= beta; // device-pointer scal, no host sync
|
||||||
|
p += z;
|
||||||
|
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuContext::setThreadLocal(nullptr);
|
||||||
|
state.SetItemsProcessed(state.iterations() * 200);
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_CG_DeviceMatrixOps)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
|
||||||
|
|
||||||
|
// ==========================================================================
|
||||||
|
// Raw cuBLAS device-pointer-mode CG (1 sync/iter) — performance lower bound
|
||||||
|
// ==========================================================================
|
||||||
|
|
||||||
|
// Scalar helper kernels operating on single values through pointers. Neither kernel
// uses thread or block indices: every launched thread performs the same single write.
// The device-pointer-mode CG in this file launches them as <<<1, 1, 0, stream>>>.

// out[0] = a[0] / b[0]
__global__ void scalar_div_kernel(const Scalar* a, const Scalar* b, Scalar* out) {
  const Scalar numerator = a[0];
  const Scalar denominator = b[0];
  out[0] = numerator / denominator;
}

// out[0] = -in[0]
__global__ void scalar_neg_kernel(const Scalar* in, Scalar* out) {
  const Scalar value = in[0];
  out[0] = -value;
}
|
||||||
|
|
||||||
|
static void BM_CG_DevicePointerMode(benchmark::State& state) {
|
||||||
|
cuda_warmup();
|
||||||
|
const Index n = state.range(0);
|
||||||
|
const int maxIters = 200;
|
||||||
|
|
||||||
|
SpMat A = make_spd(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
cudaStream_t stream;
|
||||||
|
cudaStreamCreate(&stream);
|
||||||
|
cublasHandle_t cublas;
|
||||||
|
cublasCreate(&cublas);
|
||||||
|
cublasSetStream(cublas, stream);
|
||||||
|
|
||||||
|
cusparseHandle_t cusparse;
|
||||||
|
cusparseCreate(&cusparse);
|
||||||
|
cusparseSetStream(cusparse, stream);
|
||||||
|
|
||||||
|
internal::DeviceBuffer d_outer((n + 1) * sizeof(int));
|
||||||
|
internal::DeviceBuffer d_inner(A.nonZeros() * sizeof(int));
|
||||||
|
internal::DeviceBuffer d_vals(A.nonZeros() * sizeof(Scalar));
|
||||||
|
cudaMemcpy(d_outer.ptr, A.outerIndexPtr(), (n + 1) * sizeof(int), cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpy(d_inner.ptr, A.innerIndexPtr(), A.nonZeros() * sizeof(int), cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpy(d_vals.ptr, A.valuePtr(), A.nonZeros() * sizeof(Scalar), cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
|
cusparseSpMatDescr_t matA;
|
||||||
|
cusparseCreateCsc(&matA, n, n, A.nonZeros(), d_outer.ptr, d_inner.ptr, d_vals.ptr, CUSPARSE_INDEX_32I,
|
||||||
|
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
|
||||||
|
|
||||||
|
internal::DeviceBuffer d_tmp_buf(n * sizeof(Scalar));
|
||||||
|
cusparseDnVecDescr_t tmp_x, tmp_y;
|
||||||
|
cusparseCreateDnVec(&tmp_x, n, d_tmp_buf.ptr, CUDA_R_64F);
|
||||||
|
cusparseCreateDnVec(&tmp_y, n, d_tmp_buf.ptr, CUDA_R_64F);
|
||||||
|
Scalar spmv_alpha = 1.0, spmv_beta = 0.0;
|
||||||
|
size_t ws_size = 0;
|
||||||
|
cusparseSpMV_bufferSize(cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &spmv_alpha, matA, tmp_x, &spmv_beta, tmp_y,
|
||||||
|
CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &ws_size);
|
||||||
|
internal::DeviceBuffer d_workspace(ws_size);
|
||||||
|
cusparseDestroyDnVec(tmp_x);
|
||||||
|
cusparseDestroyDnVec(tmp_y);
|
||||||
|
|
||||||
|
internal::DeviceBuffer d_x(n * sizeof(Scalar)), d_r(n * sizeof(Scalar));
|
||||||
|
internal::DeviceBuffer d_p(n * sizeof(Scalar)), d_tmp(n * sizeof(Scalar));
|
||||||
|
internal::DeviceBuffer d_b(n * sizeof(Scalar));
|
||||||
|
internal::DeviceBuffer d_absNew(sizeof(Scalar)), d_absOld(sizeof(Scalar));
|
||||||
|
internal::DeviceBuffer d_pdot(sizeof(Scalar)), d_alpha(sizeof(Scalar));
|
||||||
|
internal::DeviceBuffer d_neg_alpha(sizeof(Scalar)), d_beta(sizeof(Scalar));
|
||||||
|
internal::DeviceBuffer d_rnorm(sizeof(RealScalar));
|
||||||
|
|
||||||
|
cudaMemcpy(d_b.ptr, b.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
|
auto spmv = [&](Scalar* x_ptr, Scalar* y_ptr) {
|
||||||
|
cusparseDnVecDescr_t vx, vy;
|
||||||
|
cusparseCreateDnVec(&vx, n, x_ptr, CUDA_R_64F);
|
||||||
|
cusparseCreateDnVec(&vy, n, y_ptr, CUDA_R_64F);
|
||||||
|
cusparseSpMV(cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &spmv_alpha, matA, vx, &spmv_beta, vy, CUDA_R_64F,
|
||||||
|
CUSPARSE_SPMV_ALG_DEFAULT, d_workspace.ptr);
|
||||||
|
cusparseDestroyDnVec(vx);
|
||||||
|
cusparseDestroyDnVec(vy);
|
||||||
|
};
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
cudaMemsetAsync(static_cast<Scalar*>(d_x.ptr), 0, n * sizeof(Scalar), stream);
|
||||||
|
cudaMemcpyAsync(d_r.ptr, d_b.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
|
||||||
|
cudaMemcpyAsync(d_p.ptr, d_b.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
|
||||||
|
|
||||||
|
cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_DEVICE);
|
||||||
|
cublasDdot(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_p.ptr), 1,
|
||||||
|
static_cast<Scalar*>(d_absNew.ptr));
|
||||||
|
|
||||||
|
for (int i = 0; i < maxIters; ++i) {
|
||||||
|
spmv(static_cast<Scalar*>(d_p.ptr), static_cast<Scalar*>(d_tmp.ptr));
|
||||||
|
|
||||||
|
cublasDdot(cublas, n, static_cast<Scalar*>(d_p.ptr), 1, static_cast<Scalar*>(d_tmp.ptr), 1,
|
||||||
|
static_cast<Scalar*>(d_pdot.ptr));
|
||||||
|
|
||||||
|
scalar_div_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_absNew.ptr), static_cast<Scalar*>(d_pdot.ptr),
|
||||||
|
static_cast<Scalar*>(d_alpha.ptr));
|
||||||
|
scalar_neg_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_alpha.ptr), static_cast<Scalar*>(d_neg_alpha.ptr));
|
||||||
|
|
||||||
|
cublasDaxpy(cublas, n, static_cast<Scalar*>(d_alpha.ptr), static_cast<Scalar*>(d_p.ptr), 1,
|
||||||
|
static_cast<Scalar*>(d_x.ptr), 1);
|
||||||
|
cublasDaxpy(cublas, n, static_cast<Scalar*>(d_neg_alpha.ptr), static_cast<Scalar*>(d_tmp.ptr), 1,
|
||||||
|
static_cast<Scalar*>(d_r.ptr), 1);
|
||||||
|
|
||||||
|
cublasDnrm2(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<RealScalar*>(d_rnorm.ptr));
|
||||||
|
|
||||||
|
RealScalar rnorm;
|
||||||
|
cudaMemcpyAsync(&rnorm, d_rnorm.ptr, sizeof(RealScalar), cudaMemcpyDeviceToHost, stream);
|
||||||
|
cudaStreamSynchronize(stream);
|
||||||
|
if (rnorm * rnorm < 1e-20) break;
|
||||||
|
|
||||||
|
cudaMemcpyAsync(d_absOld.ptr, d_absNew.ptr, sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
|
||||||
|
cublasDdot(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_r.ptr), 1,
|
||||||
|
static_cast<Scalar*>(d_absNew.ptr));
|
||||||
|
|
||||||
|
scalar_div_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_absNew.ptr), static_cast<Scalar*>(d_absOld.ptr),
|
||||||
|
static_cast<Scalar*>(d_beta.ptr));
|
||||||
|
|
||||||
|
cublasDscal(cublas, n, static_cast<Scalar*>(d_beta.ptr), static_cast<Scalar*>(d_p.ptr), 1);
|
||||||
|
cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_HOST);
|
||||||
|
Scalar one = 1.0;
|
||||||
|
cublasDaxpy(cublas, n, &one, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_p.ptr), 1);
|
||||||
|
cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_DEVICE);
|
||||||
|
}
|
||||||
|
cudaStreamSynchronize(stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
state.SetItemsProcessed(state.iterations() * maxIters);
|
||||||
|
cusparseDestroySpMat(matA);
|
||||||
|
cusparseDestroy(cusparse);
|
||||||
|
cublasDestroy(cublas);
|
||||||
|
cudaStreamDestroy(stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_CG_DevicePointerMode)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
|
||||||
216
benchmarks/GPU/bench_gpu_cg_vs_cpu.cu
Normal file
216
benchmarks/GPU/bench_gpu_cg_vs_cpu.cu
Normal file
@@ -0,0 +1,216 @@
|
|||||||
|
// Benchmark: GPU CG vs CPU CG on realistic sparse systems.
|
||||||
|
//
|
||||||
|
// Tests 2D Laplacian (5-point stencil) and 3D Laplacian (7-point stencil)
|
||||||
|
// in both float and double precision.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// cmake --build build-bench-gpu --target bench_gpu_cg_vs_cpu
|
||||||
|
// ./build-bench-gpu/bench_gpu_cg_vs_cpu
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/IterativeLinearSolvers>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Sparse matrix generators -----------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
SparseMatrix<Scalar, ColMajor, int> make_laplacian_2d(int grid_n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
const int n = grid_n * grid_n;
|
||||||
|
SpMat A(n, n);
|
||||||
|
A.reserve(VectorXi::Constant(n, 5));
|
||||||
|
for (int i = 0; i < grid_n; ++i) {
|
||||||
|
for (int j = 0; j < grid_n; ++j) {
|
||||||
|
int idx = i * grid_n + j;
|
||||||
|
A.insert(idx, idx) = Scalar(4);
|
||||||
|
if (i > 0) A.insert(idx, idx - grid_n) = Scalar(-1);
|
||||||
|
if (i < grid_n - 1) A.insert(idx, idx + grid_n) = Scalar(-1);
|
||||||
|
if (j > 0) A.insert(idx, idx - 1) = Scalar(-1);
|
||||||
|
if (j < grid_n - 1) A.insert(idx, idx + 1) = Scalar(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
A.makeCompressed();
|
||||||
|
return A;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
SparseMatrix<Scalar, ColMajor, int> make_laplacian_3d(int grid_n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
const int n = grid_n * grid_n * grid_n;
|
||||||
|
const int n2 = grid_n * grid_n;
|
||||||
|
SpMat A(n, n);
|
||||||
|
A.reserve(VectorXi::Constant(n, 7));
|
||||||
|
for (int i = 0; i < grid_n; ++i) {
|
||||||
|
for (int j = 0; j < grid_n; ++j) {
|
||||||
|
for (int k = 0; k < grid_n; ++k) {
|
||||||
|
int idx = i * n2 + j * grid_n + k;
|
||||||
|
A.insert(idx, idx) = Scalar(6);
|
||||||
|
if (i > 0) A.insert(idx, idx - n2) = Scalar(-1);
|
||||||
|
if (i < grid_n - 1) A.insert(idx, idx + n2) = Scalar(-1);
|
||||||
|
if (j > 0) A.insert(idx, idx - grid_n) = Scalar(-1);
|
||||||
|
if (j < grid_n - 1) A.insert(idx, idx + grid_n) = Scalar(-1);
|
||||||
|
if (k > 0) A.insert(idx, idx - 1) = Scalar(-1);
|
||||||
|
if (k < grid_n - 1) A.insert(idx, idx + 1) = Scalar(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
A.makeCompressed();
|
||||||
|
return A;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void cuda_warmup() {
|
||||||
|
static bool done = false;
|
||||||
|
if (!done) {
|
||||||
|
void* p;
|
||||||
|
cudaMalloc(&p, 1);
|
||||||
|
cudaFree(p);
|
||||||
|
done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- CPU CG -----------------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar, typename MatGen>
|
||||||
|
void run_cpu_cg(benchmark::State& state, MatGen make_matrix) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
const int grid_n = state.range(0);
|
||||||
|
SpMat A = make_matrix(grid_n);
|
||||||
|
Vec b = Vec::Random(A.rows());
|
||||||
|
|
||||||
|
ConjugateGradient<SpMat, Lower | Upper> cg;
|
||||||
|
cg.setMaxIterations(10000);
|
||||||
|
cg.setTolerance(RealScalar(1e-8));
|
||||||
|
cg.compute(A);
|
||||||
|
|
||||||
|
int last_iters = 0;
|
||||||
|
for (auto _ : state) {
|
||||||
|
Vec x = cg.solve(b);
|
||||||
|
benchmark::DoNotOptimize(x.data());
|
||||||
|
last_iters = cg.iterations();
|
||||||
|
}
|
||||||
|
state.counters["n"] = A.rows();
|
||||||
|
state.counters["nnz"] = A.nonZeros();
|
||||||
|
state.counters["iters"] = last_iters;
|
||||||
|
state.counters["error"] = cg.error();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GPU CG -----------------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar, typename MatGen>
|
||||||
|
void run_gpu_cg(benchmark::State& state, MatGen make_matrix) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
cuda_warmup();
|
||||||
|
const int grid_n = state.range(0);
|
||||||
|
SpMat A = make_matrix(grid_n);
|
||||||
|
const Index n = A.rows();
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
// Extract inverse diagonal.
|
||||||
|
Vec invdiag(n);
|
||||||
|
for (Index j = 0; j < A.outerSize(); ++j) {
|
||||||
|
typename SpMat::InnerIterator it(A, j);
|
||||||
|
while (it && it.index() != j) ++it;
|
||||||
|
if (it && it.index() == j && it.value() != Scalar(0))
|
||||||
|
invdiag(j) = Scalar(1) / it.value();
|
||||||
|
else
|
||||||
|
invdiag(j) = Scalar(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuContext ctx;
|
||||||
|
GpuContext::setThreadLocal(&ctx);
|
||||||
|
GpuSparseContext<Scalar> spmv_ctx(ctx);
|
||||||
|
auto mat = spmv_ctx.deviceView(A);
|
||||||
|
auto d_invdiag = DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());
|
||||||
|
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
|
||||||
|
|
||||||
|
int last_iters = 0;
|
||||||
|
RealScalar last_error = 0;
|
||||||
|
|
||||||
|
for (auto _ : state) {
|
||||||
|
DeviceMatrix<Scalar> d_x(n, 1);
|
||||||
|
d_x.setZero(ctx);
|
||||||
|
DeviceMatrix<Scalar> residual(n, 1);
|
||||||
|
residual.copyFrom(ctx, d_b);
|
||||||
|
|
||||||
|
RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
|
||||||
|
RealScalar tol = RealScalar(1e-8);
|
||||||
|
RealScalar threshold = tol * tol * rhsNorm2;
|
||||||
|
RealScalar residualNorm2 = residual.squaredNorm(ctx);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
|
||||||
|
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
|
||||||
|
|
||||||
|
auto absNew = residual.dot(ctx, p);
|
||||||
|
Index i = 0;
|
||||||
|
Index maxIters = 10000;
|
||||||
|
while (i < maxIters) {
|
||||||
|
tmp.noalias() = mat * p;
|
||||||
|
auto alpha = absNew / p.dot(ctx, tmp);
|
||||||
|
d_x += alpha * p;
|
||||||
|
residual -= alpha * tmp;
|
||||||
|
|
||||||
|
residualNorm2 = residual.squaredNorm(ctx);
|
||||||
|
if (residualNorm2 < threshold) break;
|
||||||
|
|
||||||
|
z.cwiseProduct(ctx, d_invdiag, residual);
|
||||||
|
auto absOld = std::move(absNew);
|
||||||
|
absNew = residual.dot(ctx, z);
|
||||||
|
auto beta = absNew / absOld;
|
||||||
|
|
||||||
|
p *= beta;
|
||||||
|
p += z;
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
benchmark::DoNotOptimize(d_x.data());
|
||||||
|
last_iters = i;
|
||||||
|
last_error = numext::sqrt(residualNorm2 / rhsNorm2);
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuContext::setThreadLocal(nullptr);
|
||||||
|
state.counters["n"] = n;
|
||||||
|
state.counters["nnz"] = A.nonZeros();
|
||||||
|
state.counters["iters"] = last_iters;
|
||||||
|
state.counters["error"] = last_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 2D Laplacian, double ---------------------------------------------------
|
||||||
|
|
||||||
|
// CPU conjugate gradient on the 2D Laplacian, double precision.
static void BM_CG_CPU_2D_double(benchmark::State& state) {
  run_cpu_cg<double>(state, make_laplacian_2d<double>);
}
|
||||||
|
// GPU conjugate gradient on the 2D Laplacian, double precision.
static void BM_CG_GPU_2D_double(benchmark::State& state) {
  run_gpu_cg<double>(state, make_laplacian_2d<double>);
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_CG_CPU_2D_double)->ArgsProduct({{32, 64, 128, 256, 512}});
|
||||||
|
BENCHMARK(BM_CG_GPU_2D_double)->ArgsProduct({{32, 64, 128, 256, 512}});
|
||||||
|
|
||||||
|
// ---- 2D Laplacian, float ----------------------------------------------------
|
||||||
|
|
||||||
|
// CPU conjugate gradient on the 2D Laplacian, single precision.
static void BM_CG_CPU_2D_float(benchmark::State& state) {
  run_cpu_cg<float>(state, make_laplacian_2d<float>);
}
|
||||||
|
// GPU conjugate gradient on the 2D Laplacian, single precision.
static void BM_CG_GPU_2D_float(benchmark::State& state) {
  run_gpu_cg<float>(state, make_laplacian_2d<float>);
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_CG_CPU_2D_float)->ArgsProduct({{32, 64, 128, 256, 512}});
|
||||||
|
BENCHMARK(BM_CG_GPU_2D_float)->ArgsProduct({{32, 64, 128, 256, 512}});
|
||||||
|
|
||||||
|
// ---- 3D Laplacian, double ---------------------------------------------------
|
||||||
|
|
||||||
|
// CPU conjugate gradient on the 3D Laplacian, double precision.
static void BM_CG_CPU_3D_double(benchmark::State& state) {
  run_cpu_cg<double>(state, make_laplacian_3d<double>);
}
|
||||||
|
// GPU conjugate gradient on the 3D Laplacian, double precision.
static void BM_CG_GPU_3D_double(benchmark::State& state) {
  run_gpu_cg<double>(state, make_laplacian_3d<double>);
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_CG_CPU_3D_double)->ArgsProduct({{16, 32, 48, 64}});
|
||||||
|
BENCHMARK(BM_CG_GPU_3D_double)->ArgsProduct({{16, 32, 48, 64}});
|
||||||
|
|
||||||
|
// ---- 3D Laplacian, float ----------------------------------------------------
|
||||||
|
|
||||||
|
// CPU conjugate gradient on the 3D Laplacian, single precision.
static void BM_CG_CPU_3D_float(benchmark::State& state) {
  run_cpu_cg<float>(state, make_laplacian_3d<float>);
}
|
||||||
|
// GPU conjugate gradient on the 3D Laplacian, single precision.
static void BM_CG_GPU_3D_float(benchmark::State& state) {
  run_gpu_cg<float>(state, make_laplacian_3d<float>);
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_CG_CPU_3D_float)->ArgsProduct({{16, 32, 48, 64}});
|
||||||
|
BENCHMARK(BM_CG_GPU_3D_float)->ArgsProduct({{16, 32, 48, 64}});
|
||||||
216
benchmarks/GPU/bench_gpu_chaining.cpp
Normal file
216
benchmarks/GPU/bench_gpu_chaining.cpp
Normal file
@@ -0,0 +1,216 @@
|
|||||||
|
// GPU chaining benchmarks: measure async pipeline efficiency.
|
||||||
|
//
|
||||||
|
// Compares:
|
||||||
|
// 1. Host round-trip per solve (baseline)
|
||||||
|
// 2. DeviceMatrix chaining (no host round-trip between solves)
|
||||||
|
// 3. Varying chain lengths (1, 2, 4, 8 consecutive solves)
|
||||||
|
//
|
||||||
|
// For Nsight Systems: look for gaps between kernel launches in the timeline.
|
||||||
|
// Host round-trip creates visible idle gaps; chaining should show back-to-back kernels.
|
||||||
|
//
|
||||||
|
// nsys profile --trace=cuda,nvtx ./bench_gpu_chaining
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
|
||||||
|
#include <Eigen/Cholesky>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
#ifndef SCALAR
|
||||||
|
#define SCALAR double
|
||||||
|
#endif
|
||||||
|
|
||||||
|
using Scalar = SCALAR;
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
|
||||||
|
// Returns a random n x n matrix of the form M^H * M + n * I, which is
// Hermitian positive definite and therefore suitable for Cholesky (LLT).
static Mat make_spd(Index n) {
  const Mat base = Mat::Random(n, n);
  const Mat diagonal_shift = Mat::Identity(n, n) * static_cast<Scalar>(n);
  return base.adjoint() * base + diagonal_shift;
}
|
||||||
|
|
||||||
|
static void cuda_warmup() {
|
||||||
|
static bool done = false;
|
||||||
|
if (!done) {
|
||||||
|
void* p;
|
||||||
|
cudaMalloc(&p, 1);
|
||||||
|
cudaFree(p);
|
||||||
|
done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// Baseline: host round-trip between every solve
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Baseline: state.range(1) consecutive solves against one pre-computed
// factorization of an n x n SPD matrix (n = state.range(0)), where every
// solve takes and returns a host matrix.
static void BM_Chain_HostRoundtrip(benchmark::State& state) {
  cuda_warmup();
  const Index dim = state.range(0);
  const int num_solves = static_cast<int>(state.range(1));

  Mat spd = make_spd(dim);
  Mat rhs = Mat::Random(dim, 1);
  GpuLLT<Scalar> factor(spd);

  for (auto _ : state) {
    Mat sol = rhs;
    int remaining = num_solves;
    while (remaining-- > 0) {
      sol = factor.solve(sol);  // host → device → host each time
    }
    benchmark::DoNotOptimize(sol.data());
  }

  state.counters["n"] = dim;
  state.counters["chain"] = num_solves;
  state.counters["solves/iter"] = num_solves;
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// DeviceMatrix chaining: no host round-trip between solves
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Chains state.range(1) solves on DeviceMatrix operands against one
// pre-computed factorization, then downloads the final result with toHost().
// The right-hand side is uploaded once, outside the timed loop.
static void BM_Chain_Device(benchmark::State& state) {
  cuda_warmup();
  const Index dim = state.range(0);
  const int num_solves = static_cast<int>(state.range(1));

  Mat spd = make_spd(dim);
  Mat rhs = Mat::Random(dim, 1);
  GpuLLT<Scalar> factor(spd);
  auto dev_rhs = DeviceMatrix<Scalar>::fromHost(rhs);

  for (auto _ : state) {
    // The first solve always runs; the remaining num_solves - 1 feed on its output.
    DeviceMatrix<Scalar> dev_sol = factor.solve(dev_rhs);
    int solved = 1;
    while (solved < num_solves) {
      dev_sol = factor.solve(dev_sol);  // device → device
      ++solved;
    }
    Mat host_sol = dev_sol.toHost();  // single download at the end
    benchmark::DoNotOptimize(host_sol.data());
  }

  state.counters["n"] = dim;
  state.counters["chain"] = num_solves;
  state.counters["solves/iter"] = num_solves;
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// DeviceMatrix chaining with async download (overlap D2H with next iteration)
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Same chain as BM_Chain_Device, but the final download goes through
// toHostAsync() followed immediately by get() on the returned handle.
static void BM_Chain_DeviceAsync(benchmark::State& state) {
  cuda_warmup();
  const Index dim = state.range(0);
  const int num_solves = static_cast<int>(state.range(1));

  Mat spd = make_spd(dim);
  Mat rhs = Mat::Random(dim, 1);
  GpuLLT<Scalar> factor(spd);
  auto dev_rhs = DeviceMatrix<Scalar>::fromHost(rhs);

  for (auto _ : state) {
    DeviceMatrix<Scalar> dev_sol = factor.solve(dev_rhs);
    int solved = 1;
    while (solved < num_solves) {
      dev_sol = factor.solve(dev_sol);
      ++solved;
    }
    auto pending = dev_sol.toHostAsync();
    Mat host_sol = pending.get();
    benchmark::DoNotOptimize(host_sol.data());
  }

  state.counters["n"] = dim;
  state.counters["chain"] = num_solves;
  state.counters["solves/iter"] = num_solves;
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// Pure GPU chain (no download — measures kernel-only throughput)
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Chains state.range(1) device-side solves against one pre-computed
// factorization and then only synchronizes the solver's stream: the result
// is never copied back to the host.
static void BM_Chain_DeviceNoDownload(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int chain_len = static_cast<int>(state.range(1));

  Mat A = make_spd(n);
  Mat B = Mat::Random(n, 1);
  GpuLLT<Scalar> llt(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  for (auto _ : state) {
    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
    for (int i = 1; i < chain_len; ++i) {
      d_X = llt.solve(d_X);
    }
    // Asynchronous execution errors surface at the synchronizing call; do not
    // report timings for a run whose GPU work failed.
    if (cudaStreamSynchronize(llt.stream()) != cudaSuccess) {
      state.SkipWithError("cudaStreamSynchronize failed");
      break;  // required in a range-for loop to stop further iterations
    }
    benchmark::DoNotOptimize(d_X.data());
  }

  state.counters["n"] = n;
  state.counters["chain"] = chain_len;
  state.counters["solves/iter"] = chain_len;
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// Compute + solve chain (full pipeline: factorize, then chain solves)
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Full pipeline with host operands: each timed iteration constructs a
// GpuLLT from the host matrix and then runs state.range(1) solves, each
// taking and returning a host matrix.
static void BM_FullPipeline_Host(benchmark::State& state) {
  cuda_warmup();
  const Index dim = state.range(0);
  const int num_solves = static_cast<int>(state.range(1));

  Mat spd = make_spd(dim);
  Mat rhs = Mat::Random(dim, 1);

  for (auto _ : state) {
    GpuLLT<Scalar> factor(spd);
    Mat sol = rhs;
    int remaining = num_solves;
    while (remaining-- > 0) {
      sol = factor.solve(sol);
    }
    benchmark::DoNotOptimize(sol.data());
  }

  state.counters["n"] = dim;
  state.counters["chain"] = num_solves;
}
|
||||||
|
|
||||||
|
// Full pipeline with device operands: each timed iteration uploads A and B,
// factorizes the device copy of A, chains state.range(1) device-side solves
// and downloads the final result.
static void BM_FullPipeline_Device(benchmark::State& state) {
  cuda_warmup();
  const Index dim = state.range(0);
  const int num_solves = static_cast<int>(state.range(1));

  Mat spd = make_spd(dim);
  Mat rhs = Mat::Random(dim, 1);

  for (auto _ : state) {
    auto dev_spd = DeviceMatrix<Scalar>::fromHost(spd);
    auto dev_rhs = DeviceMatrix<Scalar>::fromHost(rhs);
    GpuLLT<Scalar> factor;
    factor.compute(dev_spd);
    DeviceMatrix<Scalar> dev_sol = factor.solve(dev_rhs);
    int solved = 1;
    while (solved < num_solves) {
      dev_sol = factor.solve(dev_sol);
      ++solved;
    }
    Mat host_sol = dev_sol.toHost();
    benchmark::DoNotOptimize(host_sol.data());
  }

  state.counters["n"] = dim;
  state.counters["chain"] = num_solves;
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// Registration
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
|
// Args: {matrix_size, chain_length}
|
||||||
|
BENCHMARK(BM_Chain_HostRoundtrip)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
|
||||||
|
BENCHMARK(BM_Chain_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
|
||||||
|
BENCHMARK(BM_Chain_DeviceAsync)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
|
||||||
|
BENCHMARK(BM_Chain_DeviceNoDownload)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
|
||||||
|
|
||||||
|
BENCHMARK(BM_FullPipeline_Host)->ArgsProduct({{256, 1024, 4096}, {1, 4}})->Unit(benchmark::kMicrosecond);
|
||||||
|
BENCHMARK(BM_FullPipeline_Device)->ArgsProduct({{256, 1024, 4096}, {1, 4}})->Unit(benchmark::kMicrosecond);
|
||||||
|
// clang-format on
|
||||||
185
benchmarks/GPU/bench_gpu_fft.cpp
Normal file
185
benchmarks/GPU/bench_gpu_fft.cpp
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
// GPU FFT benchmarks: GpuFFT 1D and 2D throughput.
|
||||||
|
//
|
||||||
|
// Measures forward and inverse FFT performance across a range of sizes,
|
||||||
|
// including plan-amortized (reuse) and cold-start (new plan) scenarios.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// cmake --build build-bench-gpu --target bench_gpu_fft
|
||||||
|
// ./build-bench-gpu/bench_gpu_fft
|
||||||
|
//
|
||||||
|
// Profiling:
|
||||||
|
// nsys profile --trace=cuda ./build-bench-gpu/bench_gpu_fft
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
#ifndef SCALAR
|
||||||
|
#define SCALAR float
|
||||||
|
#endif
|
||||||
|
|
||||||
|
using Scalar = SCALAR;
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using CVec = Matrix<Complex, Dynamic, 1>;
|
||||||
|
using RVec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using CMat = Matrix<Complex, Dynamic, Dynamic>;
|
||||||
|
|
||||||
|
// CUDA warm-up: ensure the GPU is initialized before timing.
|
||||||
|
static void cuda_warmup() {
|
||||||
|
static bool done = false;
|
||||||
|
if (!done) {
|
||||||
|
void* p;
|
||||||
|
cudaMalloc(&p, 1);
|
||||||
|
cudaFree(p);
|
||||||
|
done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// 1D C2C Forward
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Forward complex-to-complex 1D FFT of a random host vector of length
// state.range(0), reusing one GpuFFT object across iterations.
static void BM_GpuFFT_1D_C2C_Fwd(benchmark::State& state) {
  cuda_warmup();
  const Index len = state.range(0);
  CVec signal = CVec::Random(len);
  GpuFFT<Scalar> gpu_fft;

  // One untimed transform so that plan setup is not part of the measurement.
  CVec warm = gpu_fft.fwd(signal);

  for (auto _ : state) {
    benchmark::DoNotOptimize(gpu_fft.fwd(signal));
  }

  state.SetItemsProcessed(state.iterations() * len);
  // Input and output are both `len` complex values.
  state.SetBytesProcessed(state.iterations() * len * sizeof(Complex) * 2);
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_GpuFFT_1D_C2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// 1D C2C Inverse
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Inverse complex-to-complex 1D FFT. The spectrum fed to inv() is produced
// by one untimed forward transform of a random vector of length
// state.range(0).
static void BM_GpuFFT_1D_C2C_Inv(benchmark::State& state) {
  cuda_warmup();
  const Index len = state.range(0);
  CVec signal = CVec::Random(len);
  GpuFFT<Scalar> gpu_fft;
  CVec spectrum = gpu_fft.fwd(signal);

  for (auto _ : state) {
    benchmark::DoNotOptimize(gpu_fft.inv(spectrum));
  }

  state.SetItemsProcessed(state.iterations() * len);
  // Input and output are both `len` complex values.
  state.SetBytesProcessed(state.iterations() * len * sizeof(Complex) * 2);
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_GpuFFT_1D_C2C_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// 1D R2C Forward
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Forward real-to-complex 1D FFT of a random real host vector of length
// state.range(0), reusing one GpuFFT object across iterations.
static void BM_GpuFFT_1D_R2C_Fwd(benchmark::State& state) {
  cuda_warmup();
  const Index len = state.range(0);
  RVec samples = RVec::Random(len);
  GpuFFT<Scalar> gpu_fft;

  // One untimed transform so that plan setup is not part of the measurement.
  CVec warm = gpu_fft.fwd(samples);

  for (auto _ : state) {
    benchmark::DoNotOptimize(gpu_fft.fwd(samples));
  }

  state.SetItemsProcessed(state.iterations() * len);
  // Byte count assumes `len` real inputs and len / 2 + 1 complex outputs.
  state.SetBytesProcessed(state.iterations() * (len * sizeof(Scalar) + (len / 2 + 1) * sizeof(Complex)));
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_GpuFFT_1D_R2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// 1D C2R Inverse
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Inverse complex-to-real 1D FFT via invReal(). The spectrum is produced by
// one untimed forward transform of a random real vector of length
// state.range(0).
static void BM_GpuFFT_1D_C2R_Inv(benchmark::State& state) {
  cuda_warmup();
  const Index len = state.range(0);
  RVec samples = RVec::Random(len);
  GpuFFT<Scalar> gpu_fft;
  CVec spectrum = gpu_fft.fwd(samples);

  for (auto _ : state) {
    benchmark::DoNotOptimize(gpu_fft.invReal(spectrum, len));
  }

  state.SetItemsProcessed(state.iterations() * len);
  // Byte count assumes len / 2 + 1 complex inputs and `len` real outputs.
  state.SetBytesProcessed(state.iterations() * ((len / 2 + 1) * sizeof(Complex) + len * sizeof(Scalar)));
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_GpuFFT_1D_C2R_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// 2D C2C Forward
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Forward complex-to-complex 2D FFT of a random square host matrix with
// side length state.range(0), reusing one GpuFFT object across iterations.
static void BM_GpuFFT_2D_C2C_Fwd(benchmark::State& state) {
  cuda_warmup();
  const Index side = state.range(0);  // the input is side x side
  CMat image = CMat::Random(side, side);
  GpuFFT<Scalar> gpu_fft;

  // One untimed transform so that plan setup is not part of the measurement.
  CMat warm = gpu_fft.fwd2d(image);

  for (auto _ : state) {
    benchmark::DoNotOptimize(gpu_fft.fwd2d(image));
  }

  state.SetItemsProcessed(state.iterations() * side * side);
  // Input and output are both side * side complex values.
  state.SetBytesProcessed(state.iterations() * side * side * sizeof(Complex) * 2);
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_GpuFFT_2D_C2C_Fwd)->RangeMultiplier(2)->Range(64, 4096);
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// 2D C2C Roundtrip (fwd + inv)
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Forward followed by inverse 2D complex FFT of a random square host matrix
// with side length state.range(0); both transforms are inside the timed loop.
static void BM_GpuFFT_2D_C2C_Roundtrip(benchmark::State& state) {
  cuda_warmup();
  const Index side = state.range(0);
  CMat image = CMat::Random(side, side);
  GpuFFT<Scalar> gpu_fft;

  // One untimed round trip so that plan setup is not part of the measurement.
  CMat warm = gpu_fft.inv2d(gpu_fft.fwd2d(image));

  for (auto _ : state) {
    CMat spectrum = gpu_fft.fwd2d(image);
    benchmark::DoNotOptimize(gpu_fft.inv2d(spectrum));
  }

  // Two transforms per iteration, each reading and writing side * side values.
  state.SetItemsProcessed(state.iterations() * side * side * 2);
  state.SetBytesProcessed(state.iterations() * side * side * sizeof(Complex) * 4);
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_GpuFFT_2D_C2C_Roundtrip)->RangeMultiplier(2)->Range(64, 4096);
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// 1D Cold start (includes plan creation)
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Cold-start forward 1D FFT: a fresh GpuFFT object is constructed inside
// every timed iteration, so per-object setup cost is part of the measurement.
static void BM_GpuFFT_1D_ColdStart(benchmark::State& state) {
  cuda_warmup();
  const Index len = state.range(0);
  CVec signal = CVec::Random(len);

  for (auto _ : state) {
    GpuFFT<Scalar> fresh_fft;  // new object = new plans
    benchmark::DoNotOptimize(fresh_fft.fwd(signal));
  }

  state.SetItemsProcessed(state.iterations() * len);
}
|
||||||
|
|
||||||
|
BENCHMARK(BM_GpuFFT_1D_ColdStart)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
|
||||||
296
benchmarks/GPU/bench_gpu_solvers.cpp
Normal file
296
benchmarks/GPU/bench_gpu_solvers.cpp
Normal file
@@ -0,0 +1,296 @@
|
|||||||
|
// GPU solver benchmarks: GpuLLT and GpuLU compute + solve throughput.
|
||||||
|
//
|
||||||
|
// Measures factorization and solve performance for the host-matrix and
|
||||||
|
// DeviceMatrix code paths across a range of matrix sizes.
|
||||||
|
//
|
||||||
|
// For Nsight Systems profiling:
|
||||||
|
// nsys profile --trace=cuda,nvtx ./bench_gpu_solvers
|
||||||
|
//
|
||||||
|
// For Nsight Compute kernel analysis:
|
||||||
|
// ncu --set full -o profile ./bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096
|
||||||
|
|
||||||
|
#include <benchmark/benchmark.h>
|
||||||
|
|
||||||
|
#include <Eigen/Cholesky>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
#include <Eigen/LU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
#ifndef SCALAR
|
||||||
|
#define SCALAR double
|
||||||
|
#endif
|
||||||
|
|
||||||
|
using Scalar = SCALAR;
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// Helpers
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Returns a random n x n matrix of the form M^H * M + n * I, which is
// Hermitian positive definite and therefore suitable for Cholesky (LLT).
static Mat make_spd(Index n) {
  const Mat base = Mat::Random(n, n);
  const Mat diagonal_shift = Mat::Identity(n, n) * static_cast<Scalar>(n);
  return base.adjoint() * base + diagonal_shift;
}
|
||||||
|
|
||||||
|
// CUDA warm-up: ensure the GPU is initialized before timing.
|
||||||
|
static void cuda_warmup() {
|
||||||
|
static bool done = false;
|
||||||
|
if (!done) {
|
||||||
|
void* p;
|
||||||
|
cudaMalloc(&p, 1);
|
||||||
|
cudaFree(p);
|
||||||
|
done = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// GpuLLT benchmarks
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Factorize from host matrix (includes H2D upload).
|
||||||
|
static void BM_GpuLLT_Compute_Host(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = make_spd(n);
  GpuLLT<Scalar> llt;

  for (auto _ : state) {
    llt.compute(A);
    if (llt.info() != Success) {
      state.SkipWithError("factorization failed");
      // A ranged-for benchmark loop must be left explicitly after
      // SkipWithError, otherwise the remaining iterations still execute.
      break;
    }
  }

  // Flop model used for the reported rate: n^3 / 3 per factorization.
  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}
|
||||||
|
|
||||||
|
// Factorize from DeviceMatrix (D2D copy path).
|
||||||
|
static void BM_GpuLLT_Compute_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = make_spd(n);
  // Uploaded once, outside the timed loop; each iteration passes the same
  // device matrix as an lvalue.
  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  GpuLLT<Scalar> llt;

  for (auto _ : state) {
    llt.compute(d_A);
    if (llt.info() != Success) {
      state.SkipWithError("factorization failed");
      // A ranged-for benchmark loop must be left explicitly after
      // SkipWithError, otherwise the remaining iterations still execute.
      break;
    }
  }

  // Flop model used for the reported rate: n^3 / 3 per factorization.
  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}
|
||||||
|
|
||||||
|
// Factorize from DeviceMatrix (move path, no copy).
|
||||||
|
static void BM_GpuLLT_Compute_DeviceMove(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = make_spd(n);
  GpuLLT<Scalar> llt;

  for (auto _ : state) {
    // The device matrix is consumed by the move below, so a fresh one is
    // created every iteration. NOTE(review): this fromHost call sits inside
    // the timed region, so the reported time includes it.
    auto d_A = DeviceMatrix<Scalar>::fromHost(A);
    llt.compute(std::move(d_A));
    if (llt.info() != Success) {
      state.SkipWithError("factorization failed");
      // A ranged-for benchmark loop must be left explicitly after
      // SkipWithError, otherwise the remaining iterations still execute.
      break;
    }
  }

  // Flop model used for the reported rate: n^3 / 3 per factorization.
  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}
|
||||||
|
|
||||||
|
// Solve from host matrix (H2D + potrs + D2H).
|
||||||
|
static void BM_GpuLLT_Solve_Host(benchmark::State& state) {
  cuda_warmup();

  // range(0) is the system size, range(1) the number of right-hand sides.
  const Index dim = state.range(0);
  const Index num_rhs = state.range(1);

  // Factorize once up front; only solve() is inside the timed loop.
  Mat lhs = make_spd(dim);
  Mat rhs = Mat::Random(dim, num_rhs);
  GpuLLT<Scalar> factorization(lhs);

  for (auto _ : state) {
    Mat solution = factorization.solve(rhs);
    benchmark::DoNotOptimize(solution.data());
  }

  state.counters["n"] = dim;
  state.counters["nrhs"] = num_rhs;
}
|
||||||
|
|
||||||
|
// Solve from DeviceMatrix (D2D + potrs, async, toHost at end).
|
||||||
|
static void BM_GpuLLT_Solve_Device(benchmark::State& state) {
  cuda_warmup();

  // range(0) is the system size, range(1) the number of right-hand sides.
  const Index dim = state.range(0);
  const Index num_rhs = state.range(1);

  // Factorization and right-hand-side upload happen once, before timing.
  Mat lhs = make_spd(dim);
  Mat rhs = Mat::Random(dim, num_rhs);
  GpuLLT<Scalar> factorization(lhs);
  auto device_rhs = DeviceMatrix<Scalar>::fromHost(rhs);

  for (auto _ : state) {
    // Each iteration times the device solve plus the download of the result.
    DeviceMatrix<Scalar> device_solution = factorization.solve(device_rhs);
    Mat solution = device_solution.toHost();
    benchmark::DoNotOptimize(solution.data());
  }

  state.counters["n"] = dim;
  state.counters["nrhs"] = num_rhs;
}
|
||||||
|
|
||||||
|
// Solve staying entirely on device (no toHost — measures pure GPU time).
|
||||||
|
static void BM_GpuLLT_Solve_DeviceOnly(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = make_spd(n);
  Mat B = Mat::Random(n, nrhs);
  GpuLLT<Scalar> llt(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);

  for (auto _ : state) {
    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
    // Force completion without D2H transfer. The return value is checked so
    // that a failed solve is reported instead of being timed as a success.
    const cudaError_t sync_status = cudaStreamSynchronize(llt.stream());
    if (sync_status != cudaSuccess) {
      state.SkipWithError(cudaGetErrorString(sync_status));
      // A ranged-for benchmark loop must be left explicitly after
      // SkipWithError, otherwise the remaining iterations still execute.
      break;
    }
    benchmark::DoNotOptimize(d_X.data());
  }

  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// GpuLU benchmarks
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Factorize a random n-by-n host matrix with GpuLU; compute() is handed the
// host matrix on every iteration.
static void BM_GpuLU_Compute_Host(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = Mat::Random(n, n);
  GpuLU<Scalar> lu;

  for (auto _ : state) {
    lu.compute(A);
    if (lu.info() != Success) {
      state.SkipWithError("factorization failed");
      // A ranged-for benchmark loop must be left explicitly after
      // SkipWithError, otherwise the remaining iterations still execute.
      break;
    }
  }

  // Flop model used for the reported rate: (2/3) n^3 per factorization.
  double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}
|
||||||
|
|
||||||
|
// Factorize a random n-by-n matrix with GpuLU, starting from a DeviceMatrix
// that is uploaded once before the timed loop.
static void BM_GpuLU_Compute_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = Mat::Random(n, n);
  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  GpuLU<Scalar> lu;

  for (auto _ : state) {
    lu.compute(d_A);
    if (lu.info() != Success) {
      state.SkipWithError("factorization failed");
      // A ranged-for benchmark loop must be left explicitly after
      // SkipWithError, otherwise the remaining iterations still execute.
      break;
    }
  }

  // Flop model used for the reported rate: (2/3) n^3 per factorization.
  double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}
|
||||||
|
|
||||||
|
// GpuLU solve with host-side right-hand side and host-side result.
static void BM_GpuLU_Solve_Host(benchmark::State& state) {
  cuda_warmup();

  // range(0) is the system size, range(1) the number of right-hand sides.
  const Index dim = state.range(0);
  const Index num_rhs = state.range(1);

  // Factorize once up front; only solve() is inside the timed loop.
  Mat lhs = Mat::Random(dim, dim);
  Mat rhs = Mat::Random(dim, num_rhs);
  GpuLU<Scalar> factorization(lhs);

  for (auto _ : state) {
    Mat solution = factorization.solve(rhs);
    benchmark::DoNotOptimize(solution.data());
  }

  state.counters["n"] = dim;
  state.counters["nrhs"] = num_rhs;
}
|
||||||
|
|
||||||
|
// GpuLU solve with a device-resident right-hand side; the result is copied
// back to the host inside the timed loop.
static void BM_GpuLU_Solve_Device(benchmark::State& state) {
  cuda_warmup();

  // range(0) is the system size, range(1) the number of right-hand sides.
  const Index dim = state.range(0);
  const Index num_rhs = state.range(1);

  // Factorization and right-hand-side upload happen once, before timing.
  Mat lhs = Mat::Random(dim, dim);
  Mat rhs = Mat::Random(dim, num_rhs);
  GpuLU<Scalar> factorization(lhs);
  auto device_rhs = DeviceMatrix<Scalar>::fromHost(rhs);

  for (auto _ : state) {
    DeviceMatrix<Scalar> device_solution = factorization.solve(device_rhs);
    Mat solution = device_solution.toHost();
    benchmark::DoNotOptimize(solution.data());
  }

  state.counters["n"] = dim;
  state.counters["nrhs"] = num_rhs;
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// CPU baselines for comparison
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// CPU baseline: Eigen::LLT factorization of the same kind of matrix the GPU
// LLT benchmarks use.
static void BM_CpuLLT_Compute(benchmark::State& state) {
  const Index dim = state.range(0);
  Mat input = make_spd(dim);
  LLT<Mat> cholesky;

  for (auto _ : state) {
    cholesky.compute(input);
    // Keep the factor observable so the factorization cannot be elided.
    benchmark::DoNotOptimize(cholesky.matrixLLT().data());
  }

  // Flop model used for the reported rate: n^3 / 3 per factorization.
  const double dim_as_double = static_cast<double>(dim);
  double flops = dim_as_double * dim_as_double * dim_as_double / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = dim;
}
|
||||||
|
|
||||||
|
// CPU baseline: Eigen::PartialPivLU factorization of a random square matrix.
static void BM_CpuLU_Compute(benchmark::State& state) {
  const Index dim = state.range(0);
  Mat input = Mat::Random(dim, dim);
  PartialPivLU<Mat> decomposition;

  for (auto _ : state) {
    decomposition.compute(input);
    // Keep the factors observable so the factorization cannot be elided.
    benchmark::DoNotOptimize(decomposition.matrixLU().data());
  }

  // Flop model used for the reported rate: (2/3) n^3 per factorization.
  const double dim_as_double = static_cast<double>(dim);
  double flops = 2.0 / 3.0 * dim_as_double * dim_as_double * dim_as_double;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
  state.counters["n"] = dim;
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
// Registration
|
||||||
|
// --------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// clang-format off
|
||||||
|
// GPU Cholesky: factorization sweeps over n; solves sweep over {n} x {nrhs}.
BENCHMARK(BM_GpuLLT_Compute_Host)
    ->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Compute_Device)
    ->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Compute_DeviceMove)
    ->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_Host)
    ->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_Device)
    ->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_DeviceOnly)
    ->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})
    ->Unit(benchmark::kMicrosecond);

// GPU LU: same sweeps as the Cholesky benchmarks.
BENCHMARK(BM_GpuLU_Compute_Host)
    ->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Compute_Device)
    ->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Solve_Host)
    ->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Solve_Device)
    ->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})
    ->Unit(benchmark::kMicrosecond);

// CPU baselines over the same factorization sizes.
BENCHMARK(BM_CpuLLT_Compute)
    ->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_CpuLU_Compute)
    ->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})
    ->Unit(benchmark::kMicrosecond);
|
||||||
|
// clang-format on
|
||||||
@@ -211,20 +211,20 @@ build:linux:x86-64:nvhpc-26.1:default:unsupported:
|
|||||||
# Build on regular linux to limit GPU cost.
|
# Build on regular linux to limit GPU cost.
|
||||||
- saas-linux-2xlarge-amd64
|
- saas-linux-2xlarge-amd64
|
||||||
|
|
||||||
# GCC-10, CUDA-12.2
|
# GCC-11, CUDA-12.2
|
||||||
build:linux:cuda-12.2:gcc-10:
|
build:linux:cuda-12.2:gcc-11:
|
||||||
extends: .build:linux:cuda
|
extends: .build:linux:cuda
|
||||||
image: nvidia/cuda:12.2.0-devel-ubuntu20.04
|
image: nvidia/cuda:12.2.0-devel-ubuntu22.04
|
||||||
variables:
|
variables:
|
||||||
EIGEN_CI_C_COMPILER: gcc-10
|
EIGEN_CI_C_COMPILER: gcc-11
|
||||||
EIGEN_CI_CXX_COMPILER: g++-10
|
EIGEN_CI_CXX_COMPILER: g++-11
|
||||||
|
|
||||||
# Clang-12, CUDA-12.2
|
# Clang-14, CUDA-12.2
|
||||||
build:linux:cuda-12.2:clang-12:
|
build:linux:cuda-12.2:clang-14:
|
||||||
extends: build:linux:cuda-12.2:gcc-10
|
extends: build:linux:cuda-12.2:gcc-11
|
||||||
variables:
|
variables:
|
||||||
EIGEN_CI_C_COMPILER: clang-12
|
EIGEN_CI_C_COMPILER: clang-14
|
||||||
EIGEN_CI_CXX_COMPILER: clang++-12
|
EIGEN_CI_CXX_COMPILER: clang++-14
|
||||||
EIGEN_CI_TEST_CUDA_CLANG: "on"
|
EIGEN_CI_TEST_CUDA_CLANG: "on"
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -265,23 +265,23 @@ test:linux:x86-64:nvhpc-26.1:default:unsupported:
|
|||||||
tags:
|
tags:
|
||||||
- saas-linux-medium-amd64-gpu-standard
|
- saas-linux-medium-amd64-gpu-standard
|
||||||
|
|
||||||
# GCC-10, CUDA-12.2
|
# GCC-11, CUDA-12.2
|
||||||
test:linux:cuda-12.2:gcc-10:
|
test:linux:cuda-12.2:gcc-11:
|
||||||
extends: .test:linux:cuda
|
extends: .test:linux:cuda
|
||||||
image: nvidia/cuda:12.2.0-devel-ubuntu20.04
|
image: nvidia/cuda:12.2.0-devel-ubuntu22.04
|
||||||
needs: [ build:linux:cuda-12.2:gcc-10 ]
|
needs: [ build:linux:cuda-12.2:gcc-11 ]
|
||||||
variables:
|
variables:
|
||||||
EIGEN_CI_CXX_COMPILER: g++-10
|
EIGEN_CI_CXX_COMPILER: g++-11
|
||||||
EIGEN_CI_CC_COMPILER: gcc-10
|
EIGEN_CI_CC_COMPILER: gcc-11
|
||||||
|
|
||||||
# Clang-12, CUDA-12.2
|
# Clang-14, CUDA-12.2
|
||||||
test:linux:cuda-12.2:clang-12:
|
test:linux:cuda-12.2:clang-14:
|
||||||
extends: .test:linux:cuda
|
extends: .test:linux:cuda
|
||||||
image: nvidia/cuda:12.2.0-devel-ubuntu20.04
|
image: nvidia/cuda:12.2.0-devel-ubuntu22.04
|
||||||
needs: [ build:linux:cuda-12.2:clang-12 ]
|
needs: [ build:linux:cuda-12.2:clang-14 ]
|
||||||
variables:
|
variables:
|
||||||
EIGEN_CI_CXX_COMPILER: clang++-12
|
EIGEN_CI_CXX_COMPILER: clang++-14
|
||||||
EIGEN_CI_CC_COMPILER: clang-12
|
EIGEN_CI_CC_COMPILER: clang-14
|
||||||
|
|
||||||
|
|
||||||
##### arm ######################################################################
|
##### arm ######################################################################
|
||||||
|
|||||||
@@ -479,6 +479,170 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
|
|||||||
|
|
||||||
ei_add_test(gpu_example)
|
ei_add_test(gpu_example)
|
||||||
ei_add_test(gpu_basic)
|
ei_add_test(gpu_basic)
|
||||||
|
ei_add_test(gpu_library_example "" "CUDA::cusolver")
|
||||||
|
|
||||||
|
# DeviceMatrix tests: CUDA runtime + cuBLAS + cuSOLVER (for BLAS-1 ops via GpuContext).
|
||||||
|
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||||
|
add_executable(gpu_device_matrix gpu_device_matrix.cpp)
|
||||||
|
target_include_directories(gpu_device_matrix PRIVATE
|
||||||
|
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}")
|
||||||
|
target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart CUDA::cublas CUDA::cusolver CUDA::npps CUDA::nppc)
|
||||||
|
target_compile_definitions(gpu_device_matrix PRIVATE
|
||||||
|
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
|
||||||
|
EIGEN_TEST_PART_ALL=1)
|
||||||
|
add_test(NAME gpu_device_matrix COMMAND gpu_device_matrix)
|
||||||
|
add_dependencies(buildtests gpu_device_matrix)
|
||||||
|
add_dependencies(buildtests_gpu gpu_device_matrix)
|
||||||
|
set_property(TEST gpu_device_matrix APPEND PROPERTY LABELS "Official;gpu")
|
||||||
|
set_property(TEST gpu_device_matrix PROPERTY SKIP_RETURN_CODE 77)
|
||||||
|
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||||
|
|
||||||
|
# Library-specific GPU tests (activated by later phases, OFF by default).
|
||||||
|
# CUDAToolkit imported targets (CUDA::cublas, etc.) are available from
|
||||||
|
# find_package(CUDAToolkit) above.
|
||||||
|
option(EIGEN_TEST_CUBLAS "Test cuBLAS integration" OFF)
|
||||||
|
if(EIGEN_TEST_CUBLAS AND TARGET CUDA::cublas)
|
||||||
|
# cuBLAS tests are plain .cpp files (no device code), like cuSOLVER tests.
|
||||||
|
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||||
|
add_executable(gpu_cublas gpu_cublas.cpp)
|
||||||
|
target_include_directories(gpu_cublas PRIVATE
|
||||||
|
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}")
|
||||||
|
target_link_libraries(gpu_cublas
|
||||||
|
Eigen3::Eigen CUDA::cudart CUDA::cublas CUDA::cusolver)
|
||||||
|
target_compile_definitions(gpu_cublas PRIVATE
|
||||||
|
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
|
||||||
|
EIGEN_TEST_PART_ALL=1)
|
||||||
|
add_test(NAME gpu_cublas COMMAND gpu_cublas)
|
||||||
|
add_dependencies(buildtests gpu_cublas)
|
||||||
|
add_dependencies(buildtests_gpu gpu_cublas)
|
||||||
|
set_property(TEST gpu_cublas APPEND PROPERTY LABELS "Official;gpu")
|
||||||
|
set_property(TEST gpu_cublas PROPERTY SKIP_RETURN_CODE 77)
|
||||||
|
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
option(EIGEN_TEST_CUSOLVER "Test cuSOLVER integration" OFF)
|
||||||
|
if(EIGEN_TEST_CUSOLVER AND TARGET CUDA::cusolver)
|
||||||
|
# cuSOLVER tests are plain .cpp files: no device code, compiled by the host
|
||||||
|
# compiler and linked against CUDA runtime + cuSOLVER. This avoids NVCC
|
||||||
|
# instantiating Eigen's CPU packet operations for CUDA vector types.
|
||||||
|
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||||
|
foreach(_cusolver_test IN ITEMS gpu_cusolver_llt gpu_cusolver_lu gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen)
|
||||||
|
add_executable(${_cusolver_test} ${_cusolver_test}.cpp)
|
||||||
|
target_include_directories(${_cusolver_test} PRIVATE
|
||||||
|
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}")
|
||||||
|
target_link_libraries(${_cusolver_test}
|
||||||
|
Eigen3::Eigen CUDA::cudart CUDA::cusolver CUDA::cublas)
|
||||||
|
target_compile_definitions(${_cusolver_test} PRIVATE
|
||||||
|
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
|
||||||
|
EIGEN_TEST_PART_ALL=1)
|
||||||
|
add_test(NAME ${_cusolver_test} COMMAND "${_cusolver_test}")
|
||||||
|
add_dependencies(buildtests ${_cusolver_test})
|
||||||
|
add_dependencies(buildtests_gpu ${_cusolver_test})
|
||||||
|
set_property(TEST ${_cusolver_test} APPEND PROPERTY LABELS "Official;gpu")
|
||||||
|
set_property(TEST ${_cusolver_test} PROPERTY SKIP_RETURN_CODE 77)
|
||||||
|
endforeach()
|
||||||
|
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# cuFFT test (cuFFT is part of the CUDA toolkit — no separate option needed).
|
||||||
|
if(TARGET CUDA::cufft)
|
||||||
|
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||||
|
add_executable(gpu_cufft gpu_cufft.cpp)
|
||||||
|
target_include_directories(gpu_cufft PRIVATE
|
||||||
|
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}")
|
||||||
|
target_link_libraries(gpu_cufft
|
||||||
|
Eigen3::Eigen CUDA::cudart CUDA::cufft CUDA::cublas)
|
||||||
|
target_compile_definitions(gpu_cufft PRIVATE
|
||||||
|
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
|
||||||
|
EIGEN_TEST_PART_ALL=1)
|
||||||
|
add_test(NAME gpu_cufft COMMAND gpu_cufft)
|
||||||
|
add_dependencies(buildtests gpu_cufft)
|
||||||
|
add_dependencies(buildtests_gpu gpu_cufft)
|
||||||
|
set_property(TEST gpu_cufft APPEND PROPERTY LABELS "Official;gpu")
|
||||||
|
set_property(TEST gpu_cufft PROPERTY SKIP_RETURN_CODE 77)
|
||||||
|
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# cuSPARSE SpMV test (cuSPARSE is part of the CUDA toolkit).
|
||||||
|
if(TARGET CUDA::cusparse)
|
||||||
|
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||||
|
add_executable(gpu_cusparse_spmv gpu_cusparse_spmv.cpp)
|
||||||
|
target_include_directories(gpu_cusparse_spmv PRIVATE
|
||||||
|
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}")
|
||||||
|
target_link_libraries(gpu_cusparse_spmv
|
||||||
|
Eigen3::Eigen CUDA::cudart CUDA::cusparse CUDA::cublas CUDA::cusolver)
|
||||||
|
target_compile_definitions(gpu_cusparse_spmv PRIVATE
|
||||||
|
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
|
||||||
|
EIGEN_TEST_PART_ALL=1)
|
||||||
|
add_test(NAME gpu_cusparse_spmv COMMAND gpu_cusparse_spmv)
|
||||||
|
add_dependencies(buildtests gpu_cusparse_spmv)
|
||||||
|
add_dependencies(buildtests_gpu gpu_cusparse_spmv)
|
||||||
|
set_property(TEST gpu_cusparse_spmv APPEND PROPERTY LABELS "Official;gpu")
|
||||||
|
set_property(TEST gpu_cusparse_spmv PROPERTY SKIP_RETURN_CODE 77)
|
||||||
|
|
||||||
|
# End-to-end GPU CG test: Eigen's ConjugateGradient with DeviceMatrix.
|
||||||
|
add_executable(gpu_cg gpu_cg.cpp)
|
||||||
|
target_include_directories(gpu_cg PRIVATE
|
||||||
|
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}")
|
||||||
|
target_link_libraries(gpu_cg
|
||||||
|
Eigen3::Eigen CUDA::cudart CUDA::cusparse CUDA::cublas CUDA::cusolver CUDA::npps CUDA::nppc)
|
||||||
|
target_compile_definitions(gpu_cg PRIVATE
|
||||||
|
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
|
||||||
|
EIGEN_TEST_PART_ALL=1)
|
||||||
|
add_test(NAME gpu_cg COMMAND gpu_cg)
|
||||||
|
add_dependencies(buildtests gpu_cg)
|
||||||
|
add_dependencies(buildtests_gpu gpu_cg)
|
||||||
|
set_property(TEST gpu_cg APPEND PROPERTY LABELS "Official;gpu")
|
||||||
|
set_property(TEST gpu_cg PROPERTY SKIP_RETURN_CODE 77)
|
||||||
|
|
||||||
|
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
option(EIGEN_TEST_CUSPARSE "Test cuSPARSE integration" OFF)
|
||||||
|
if(EIGEN_TEST_CUSPARSE AND TARGET CUDA::cusparse)
|
||||||
|
ei_add_test(gpu_cusparse "" "CUDA::cusparse")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# cuDSS sparse direct solver tests.
|
||||||
|
# cuDSS is distributed separately from the CUDA Toolkit.
|
||||||
|
option(EIGEN_TEST_CUDSS "Test cuDSS sparse solver integration" OFF)
|
||||||
|
if(EIGEN_TEST_CUDSS)
|
||||||
|
find_path(CUDSS_INCLUDE_DIR cudss.h
|
||||||
|
HINTS ${CUDSS_DIR}/include ${CUDA_TOOLKIT_ROOT_DIR}/include /usr/include)
|
||||||
|
find_library(CUDSS_LIBRARY cudss
|
||||||
|
HINTS ${CUDSS_DIR}/lib ${CUDSS_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib/x86_64-linux-gnu)
|
||||||
|
if(CUDSS_INCLUDE_DIR AND CUDSS_LIBRARY)
|
||||||
|
message(STATUS "cuDSS found: ${CUDSS_LIBRARY}")
|
||||||
|
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||||
|
foreach(_cudss_test IN ITEMS gpu_cudss_llt gpu_cudss_ldlt gpu_cudss_lu)
|
||||||
|
add_executable(${_cudss_test} ${_cudss_test}.cpp)
|
||||||
|
target_include_directories(${_cudss_test} PRIVATE
|
||||||
|
"${CUDA_TOOLKIT_ROOT_DIR}/include"
|
||||||
|
"${CUDSS_INCLUDE_DIR}"
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}")
|
||||||
|
target_link_libraries(${_cudss_test}
|
||||||
|
Eigen3::Eigen CUDA::cudart CUDA::cusolver CUDA::cublas ${CUDSS_LIBRARY})
|
||||||
|
target_compile_definitions(${_cudss_test} PRIVATE
|
||||||
|
EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
|
||||||
|
EIGEN_TEST_PART_ALL=1
|
||||||
|
EIGEN_CUDSS=1)
|
||||||
|
add_test(NAME ${_cudss_test} COMMAND "${_cudss_test}")
|
||||||
|
add_dependencies(buildtests ${_cudss_test})
|
||||||
|
add_dependencies(buildtests_gpu ${_cudss_test})
|
||||||
|
set_property(TEST ${_cudss_test} APPEND PROPERTY LABELS "Official;gpu")
|
||||||
|
set_property(TEST ${_cudss_test} PROPERTY SKIP_RETURN_CODE 77)
|
||||||
|
endforeach()
|
||||||
|
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
|
||||||
|
else()
|
||||||
|
message(WARNING "EIGEN_TEST_CUDSS=ON but cuDSS not found. Set CUDSS_DIR.")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
|
||||||
|
|
||||||
|
|||||||
224
test/gpu_cg.cpp
Normal file
224
test/gpu_cg.cpp
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// End-to-end test: CG algorithm running on GPU via DeviceMatrix.
|
||||||
|
//
|
||||||
|
// Uses DeviceSparseView for SpMV, DeviceMatrix for vectors, DeviceScalar
|
||||||
|
// for deferred reductions. Verifies correctness against CPU ConjugateGradient.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/IterativeLinearSolvers>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Helper: build a sparse SPD matrix --------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat R(n, n);
|
||||||
|
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
|
||||||
|
for (Index j = 0; j < n; ++j) {
|
||||||
|
for (Index i = 0; i < n; ++i) {
|
||||||
|
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
|
||||||
|
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
R.makeCompressed();
|
||||||
|
SpMat A = R.adjoint() * R;
|
||||||
|
for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
|
||||||
|
A.makeCompressed();
|
||||||
|
return A;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GPU CG without preconditioner ------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gpu_cg(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_spd<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
// CPU reference (identity preconditioner to match GPU).
|
||||||
|
ConjugateGradient<SpMat, Lower | Upper, IdentityPreconditioner> cpu_cg;
|
||||||
|
cpu_cg.setMaxIterations(1000);
|
||||||
|
cpu_cg.setTolerance(RealScalar(1e-8));
|
||||||
|
cpu_cg.compute(A);
|
||||||
|
Vec x_cpu = cpu_cg.solve(b);
|
||||||
|
VERIFY_IS_EQUAL(cpu_cg.info(), Success);
|
||||||
|
|
||||||
|
// GPU CG: mirrors Eigen's conjugate_gradient() using DeviceMatrix ops.
|
||||||
|
GpuContext ctx;
|
||||||
|
GpuContext::setThreadLocal(&ctx);
|
||||||
|
GpuSparseContext<Scalar> spmv_ctx(ctx);
|
||||||
|
auto mat = spmv_ctx.deviceView(A);
|
||||||
|
|
||||||
|
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
|
||||||
|
DeviceMatrix<Scalar> d_x(n, 1);
|
||||||
|
d_x.setZero(ctx);
|
||||||
|
|
||||||
|
// r = b (since x=0)
|
||||||
|
DeviceMatrix<Scalar> residual(n, 1);
|
||||||
|
residual.copyFrom(ctx, d_b);
|
||||||
|
|
||||||
|
RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
|
||||||
|
RealScalar tol = RealScalar(1e-8);
|
||||||
|
RealScalar threshold = tol * tol * rhsNorm2;
|
||||||
|
RealScalar residualNorm2 = residual.squaredNorm(ctx);
|
||||||
|
|
||||||
|
// p = r (no preconditioner)
|
||||||
|
DeviceMatrix<Scalar> p(n, 1);
|
||||||
|
p.copyFrom(ctx, residual);
|
||||||
|
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
|
||||||
|
|
||||||
|
auto absNew = residual.dot(ctx, p);
|
||||||
|
Index maxIters = 1000;
|
||||||
|
Index i = 0;
|
||||||
|
while (i < maxIters) {
|
||||||
|
tmp.noalias() = mat * p;
|
||||||
|
|
||||||
|
auto alpha = absNew / p.dot(ctx, tmp);
|
||||||
|
d_x += alpha * p;
|
||||||
|
residual -= alpha * tmp;
|
||||||
|
|
||||||
|
residualNorm2 = residual.squaredNorm(ctx);
|
||||||
|
if (residualNorm2 < threshold) break;
|
||||||
|
|
||||||
|
// z = r (no preconditioner)
|
||||||
|
z.copyFrom(ctx, residual);
|
||||||
|
|
||||||
|
auto absOld = std::move(absNew);
|
||||||
|
absNew = residual.dot(ctx, z);
|
||||||
|
auto beta = absNew / absOld;
|
||||||
|
|
||||||
|
p *= Scalar(beta);
|
||||||
|
p += z;
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuContext::setThreadLocal(nullptr);
|
||||||
|
|
||||||
|
Vec x_gpu = d_x.toHost(ctx.stream());
|
||||||
|
|
||||||
|
// Verify residual.
|
||||||
|
Vec r = A * x_gpu - b;
|
||||||
|
RealScalar relres = r.norm() / b.norm();
|
||||||
|
VERIFY(relres < RealScalar(1e-6));
|
||||||
|
|
||||||
|
// Compare with CPU.
|
||||||
|
RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GPU CG with Jacobi preconditioner --------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gpu_cg_jacobi(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_spd<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
// CPU reference.
|
||||||
|
ConjugateGradient<SpMat, Lower | Upper> cpu_cg;
|
||||||
|
cpu_cg.setMaxIterations(1000);
|
||||||
|
cpu_cg.setTolerance(RealScalar(1e-8));
|
||||||
|
cpu_cg.compute(A);
|
||||||
|
Vec x_cpu = cpu_cg.solve(b);
|
||||||
|
|
||||||
|
// Extract inverse diagonal.
|
||||||
|
Vec invdiag(n);
|
||||||
|
for (Index j = 0; j < A.outerSize(); ++j) {
|
||||||
|
typename SpMat::InnerIterator it(A, j);
|
||||||
|
while (it && it.index() != j) ++it;
|
||||||
|
if (it && it.index() == j && it.value() != Scalar(0))
|
||||||
|
invdiag(j) = Scalar(1) / it.value();
|
||||||
|
else
|
||||||
|
invdiag(j) = Scalar(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// GPU CG with Jacobi preconditioner.
|
||||||
|
GpuContext ctx;
|
||||||
|
GpuContext::setThreadLocal(&ctx);
|
||||||
|
GpuSparseContext<Scalar> spmv_ctx(ctx);
|
||||||
|
auto mat = spmv_ctx.deviceView(A);
|
||||||
|
auto d_invdiag = DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());
|
||||||
|
|
||||||
|
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
|
||||||
|
DeviceMatrix<Scalar> d_x(n, 1);
|
||||||
|
d_x.setZero(ctx);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> residual(n, 1);
|
||||||
|
residual.copyFrom(ctx, d_b);
|
||||||
|
|
||||||
|
RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
|
||||||
|
RealScalar tol = RealScalar(1e-8);
|
||||||
|
RealScalar threshold = tol * tol * rhsNorm2;
|
||||||
|
RealScalar residualNorm2 = residual.squaredNorm(ctx);
|
||||||
|
|
||||||
|
// p = precond.solve(r) = invdiag .* r
|
||||||
|
DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
|
||||||
|
DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
|
||||||
|
|
||||||
|
auto absNew = residual.dot(ctx, p);
|
||||||
|
Index maxIters = 1000;
|
||||||
|
Index i = 0;
|
||||||
|
while (i < maxIters) {
|
||||||
|
tmp.noalias() = mat * p;
|
||||||
|
|
||||||
|
auto alpha = absNew / p.dot(ctx, tmp);
|
||||||
|
d_x += alpha * p;
|
||||||
|
residual -= alpha * tmp;
|
||||||
|
|
||||||
|
residualNorm2 = residual.squaredNorm(ctx);
|
||||||
|
if (residualNorm2 < threshold) break;
|
||||||
|
|
||||||
|
// z = precond.solve(r) = invdiag .* r
|
||||||
|
z.cwiseProduct(ctx, d_invdiag, residual);
|
||||||
|
|
||||||
|
auto absOld = std::move(absNew);
|
||||||
|
absNew = residual.dot(ctx, z);
|
||||||
|
auto beta = absNew / absOld;
|
||||||
|
|
||||||
|
p *= beta;
|
||||||
|
p += z;
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuContext::setThreadLocal(nullptr);
|
||||||
|
|
||||||
|
Vec x_gpu = d_x.toHost(ctx.stream());
|
||||||
|
|
||||||
|
Vec r = A * x_gpu - b;
|
||||||
|
RealScalar relres = r.norm() / b.norm();
|
||||||
|
VERIFY(relres < RealScalar(1e-6));
|
||||||
|
|
||||||
|
RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test entry point: each CG variant is exercised in double precision at two
// sizes and in single precision at the smaller size.
EIGEN_DECLARE_TEST(gpu_cg) {
  // Unpreconditioned CG.
  CALL_SUBTEST(test_gpu_cg<double>(64));
  CALL_SUBTEST(test_gpu_cg<double>(256));
  CALL_SUBTEST(test_gpu_cg<float>(64));

  // Jacobi-preconditioned CG.
  CALL_SUBTEST(test_gpu_cg_jacobi<double>(64));
  CALL_SUBTEST(test_gpu_cg_jacobi<double>(256));
  CALL_SUBTEST(test_gpu_cg_jacobi<float>(64));
}
|
||||||
72
test/gpu_context.h
Normal file
72
test/gpu_context.h
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
#ifndef EIGEN_TEST_GPU_CONTEXT_H
|
||||||
|
#define EIGEN_TEST_GPU_CONTEXT_H
|
||||||
|
|
||||||
|
// RAII context for GPU tests that use NVIDIA library APIs (cuBLAS, cuSOLVER, etc.).
|
||||||
|
// Owns a non-default CUDA stream. Library handles (cuBLAS, cuSOLVER, etc.) are added
|
||||||
|
// here by each integration phase as needed; each handle is bound to the owned stream.
|
||||||
|
//
|
||||||
|
// Usage:
|
||||||
|
// GpuContext ctx;
|
||||||
|
// auto buf = gpu_copy_to_device(ctx.stream, A);
|
||||||
|
// // ... call NVIDIA library APIs using ctx.stream / ctx.cusolver ...
|
||||||
|
// ctx.synchronize();
|
||||||
|
|
||||||
|
#include "gpu_test_helper.h"
|
||||||
|
|
||||||
|
#ifdef EIGEN_USE_GPU
|
||||||
|
#include <cusolverDn.h>
|
||||||
|
|
||||||
|
// Checks cuSOLVER return codes, aborts on failure.
|
||||||
|
// Evaluates `expr` exactly once. When the returned status differs from
// CUSOLVER_STATUS_SUCCESS, prints the numeric status together with the call
// site and then invokes gpu_assert(false).
#define CUSOLVER_CHECK(expr)                                                            \
  do {                                                                                  \
    const cusolverStatus_t cusolver_check_status_ = (expr);                             \
    if (cusolver_check_status_ == CUSOLVER_STATUS_SUCCESS) break;                       \
    printf("cuSOLVER error %d at %s:%d\n", static_cast<int>(cusolver_check_status_),    \
           __FILE__, __LINE__);                                                         \
    gpu_assert(false);                                                                  \
  } while (0)
|
||||||
|
|
||||||
|
struct GpuContext {
|
||||||
|
cudaStream_t stream = nullptr;
|
||||||
|
cusolverDnHandle_t cusolver = nullptr;
|
||||||
|
|
||||||
|
GpuContext() {
|
||||||
|
GPU_CHECK(gpuGetDevice(&device_));
|
||||||
|
GPU_CHECK(gpuGetDeviceProperties(&device_props_, device_));
|
||||||
|
GPU_CHECK(cudaStreamCreate(&stream));
|
||||||
|
CUSOLVER_CHECK(cusolverDnCreate(&cusolver));
|
||||||
|
CUSOLVER_CHECK(cusolverDnSetStream(cusolver, stream));
|
||||||
|
}
|
||||||
|
|
||||||
|
~GpuContext() {
|
||||||
|
if (cusolver) CUSOLVER_CHECK(cusolverDnDestroy(cusolver));
|
||||||
|
if (stream) GPU_CHECK(cudaStreamDestroy(stream));
|
||||||
|
}
|
||||||
|
|
||||||
|
int device() const { return device_; }
|
||||||
|
const gpuDeviceProp_t& deviceProperties() const { return device_props_; }
|
||||||
|
|
||||||
|
// Wait for all work submitted on this context's stream to complete.
|
||||||
|
void synchronize() { GPU_CHECK(cudaStreamSynchronize(stream)); }
|
||||||
|
|
||||||
|
// Non-copyable, non-movable.
|
||||||
|
GpuContext(const GpuContext&) = delete;
|
||||||
|
GpuContext& operator=(const GpuContext&) = delete;
|
||||||
|
|
||||||
|
private:
|
||||||
|
int device_ = 0;
|
||||||
|
gpuDeviceProp_t device_props_;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // EIGEN_USE_GPU
|
||||||
|
|
||||||
|
#endif // EIGEN_TEST_GPU_CONTEXT_H
|
||||||
756
test/gpu_cublas.cpp
Normal file
756
test/gpu_cublas.cpp
Normal file
@@ -0,0 +1,756 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for cuBLAS GEMM dispatch via DeviceMatrix expression syntax.
|
||||||
|
// Covers: d_C = d_A * d_B, adjoint, transpose, scaled, +=, .device(ctx).
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// Unit roundoff for GPU GEMM compute precision.
|
||||||
|
// TF32 (opt-in via EIGEN_CUDA_TF32) has eps ~ 2^{-10}.
|
||||||
|
template <typename Scalar>
|
||||||
|
typename NumTraits<Scalar>::Real gpu_unit_roundoff() {
|
||||||
|
#if defined(EIGEN_CUDA_TF32) && !defined(EIGEN_NO_CUDA_TENSOR_OPS)
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
if (std::is_same<RealScalar, float>::value) return RealScalar(9.8e-4);
|
||||||
|
#endif
|
||||||
|
return NumTraits<Scalar>::epsilon();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Higham-Mary probabilistic error bound for GEMM:
|
||||||
|
// ||C - fl(C)||_F <= lambda * sqrt(k) * u * ||A||_F * ||B||_F
|
||||||
|
// where k is the inner dimension, u is the unit roundoff, and
|
||||||
|
// lambda = sqrt(2 * ln(2/delta)) with delta = failure probability.
|
||||||
|
// lambda = 5 corresponds to delta = 2 * exp(-12.5) ~ 7.5e-6.
|
||||||
|
// Reference: Higham & Mary, "A New Approach to Probabilistic Rounding Error Analysis",
|
||||||
|
// SIAM J. Sci. Comput., 2019.
|
||||||
|
template <typename Scalar>
|
||||||
|
typename NumTraits<Scalar>::Real gemm_error_bound(Index k, typename NumTraits<Scalar>::Real normA,
|
||||||
|
typename NumTraits<Scalar>::Real normB) {
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
constexpr RealScalar lambda = 5;
|
||||||
|
return lambda * std::sqrt(static_cast<RealScalar>(k)) * gpu_unit_roundoff<Scalar>() * normA * normB;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Basic GEMM: C = A * B -------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_basic(Index m, Index n, Index k) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, k);
|
||||||
|
Mat B = Mat::Random(k, n);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
// Expression: d_C = d_A * d_B
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
d_C = d_A * d_B;
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
Mat C_ref = A * B;
|
||||||
|
|
||||||
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GEMM with adjoint: C = A^H * B ----------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_adjoint_lhs(Index m, Index n, Index k) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(k, m); // A is k×m, A^H is m×k
|
||||||
|
Mat B = Mat::Random(k, n);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
d_C = d_A.adjoint() * d_B;
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
Mat C_ref = A.adjoint() * B;
|
||||||
|
|
||||||
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GEMM with transpose: C = A * B^T --------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_transpose_rhs(Index m, Index n, Index k) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, k);
|
||||||
|
Mat B = Mat::Random(n, k); // B is n×k, B^T is k×n
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
d_C = d_A * d_B.transpose();
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
Mat C_ref = A * B.transpose();
|
||||||
|
|
||||||
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GEMM with scaled: C = alpha * A * B ------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_scaled(Index m, Index n, Index k) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, k);
|
||||||
|
Mat B = Mat::Random(k, n);
|
||||||
|
Scalar alpha = Scalar(2.5);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
d_C = alpha * d_A * d_B;
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
Mat C_ref = alpha * A * B;
|
||||||
|
|
||||||
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GEMM accumulate: C += A * B (beta=1) -----------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_accumulate(Index m, Index n, Index k) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, k);
|
||||||
|
Mat B = Mat::Random(k, n);
|
||||||
|
Mat C_init = Mat::Random(m, n);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);
|
||||||
|
|
||||||
|
d_C += d_A * d_B;
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
Mat C_ref = C_init + A * B;
|
||||||
|
|
||||||
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GEMM accumulate into empty destination ---------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_accumulate_empty(Index m, Index n, Index k) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, k);
|
||||||
|
Mat B = Mat::Random(k, n);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
|
||||||
|
d_C += d_A * d_B;
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
Mat C_ref = A * B;
|
||||||
|
|
||||||
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GEMM subtract: C -= A * B (beta=1, alpha=-1) --------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_subtract(Index m, Index n, Index k) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, k);
|
||||||
|
Mat B = Mat::Random(k, n);
|
||||||
|
Mat C_init = Mat::Random(m, n);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);
|
||||||
|
|
||||||
|
GpuContext ctx;
|
||||||
|
d_C.device(ctx) -= d_A * d_B;
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
Mat C_ref = C_init - A * B;
|
||||||
|
|
||||||
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GEMM subtract from empty destination -----------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_subtract_empty(Index m, Index n, Index k) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, k);
|
||||||
|
Mat B = Mat::Random(k, n);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
GpuContext ctx;
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
d_C.device(ctx) -= d_A * d_B;
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
Mat C_ref = -(A * B);
|
||||||
|
|
||||||
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GEMM with scaled RHS: C = A * (alpha * B) -----------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_scaled_rhs(Index m, Index n, Index k) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, k);
|
||||||
|
Mat B = Mat::Random(k, n);
|
||||||
|
Scalar alpha = Scalar(3.0);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
d_C = d_A * (alpha * d_B);
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
Mat C_ref = A * (alpha * B);
|
||||||
|
|
||||||
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GEMM dimension mismatch must assert ------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_dimension_mismatch() {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(8, 5);
|
||||||
|
Mat B = Mat::Random(6, 7); // inner dimension mismatch
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
|
||||||
|
VERIFY_RAISES_ASSERT(d_C = d_A * d_B);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GEMM with explicit GpuContext ------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_explicit_context(Index m, Index n, Index k) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, k);
|
||||||
|
Mat B = Mat::Random(k, n);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
GpuContext ctx;
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
d_C.device(ctx) = d_A * d_B;
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
Mat C_ref = A * B;
|
||||||
|
|
||||||
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), B.norm());
|
||||||
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GEMM cross-context reuse of the same destination -----------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_cross_context_reuse(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
Mat B = Mat::Random(n, n);
|
||||||
|
Mat D = Mat::Random(n, n);
|
||||||
|
Mat E = Mat::Random(n, n);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
|
||||||
|
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
|
||||||
|
|
||||||
|
GpuContext ctx1;
|
||||||
|
GpuContext ctx2;
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
d_C.device(ctx1) = d_A * d_B;
|
||||||
|
d_C.device(ctx2) += d_D * d_E;
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
Mat C_ref = A * B + D * E;
|
||||||
|
|
||||||
|
RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm()) + gemm_error_bound<Scalar>(n, D.norm(), E.norm());
|
||||||
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GEMM cross-context resize of the destination ---------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_cross_context_resize() {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(64, 64);
|
||||||
|
Mat B = Mat::Random(64, 64);
|
||||||
|
Mat D = Mat::Random(32, 16);
|
||||||
|
Mat E = Mat::Random(16, 8);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
|
||||||
|
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
|
||||||
|
|
||||||
|
GpuContext ctx1;
|
||||||
|
GpuContext ctx2;
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
d_C.device(ctx1) = d_A * d_B;
|
||||||
|
d_C.device(ctx2) = d_D * d_E;
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
Mat C_ref = D * E;
|
||||||
|
|
||||||
|
RealScalar tol = gemm_error_bound<Scalar>(16, D.norm(), E.norm());
|
||||||
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GEMM chaining: C = (A * B) then D = C * E -----------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_chain(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
Mat B = Mat::Random(n, n);
|
||||||
|
Mat E = Mat::Random(n, n);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
d_C = d_A * d_B;
|
||||||
|
DeviceMatrix<Scalar> d_D;
|
||||||
|
d_D = d_C * d_E;
|
||||||
|
|
||||||
|
Mat D = d_D.toHost();
|
||||||
|
Mat D_ref = (A * B) * E;
|
||||||
|
|
||||||
|
Mat C_ref = A * B;
|
||||||
|
RealScalar tol =
|
||||||
|
gemm_error_bound<Scalar>(n, A.norm(), B.norm()) * E.norm() + gemm_error_bound<Scalar>(n, C_ref.norm(), E.norm());
|
||||||
|
VERIFY((D - D_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Square identity check: A * I = A ---------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_identity(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
Mat eye = Mat::Identity(n, n);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_I = DeviceMatrix<Scalar>::fromHost(eye);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
d_C = d_A * d_I;
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
VERIFY_IS_APPROX(C, A);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- LLT solve expression: d_X = d_A.llt().solve(d_B) ----------------------
|
||||||
|
|
||||||
|
template <typename MatrixType>
|
||||||
|
MatrixType make_spd(Index n) {
|
||||||
|
using Scalar = typename MatrixType::Scalar;
|
||||||
|
MatrixType M = MatrixType::Random(n, n);
|
||||||
|
return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_llt_solve_expr(Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = make_spd<Mat>(n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_X;
|
||||||
|
d_X = d_A.llt().solve(d_B);
|
||||||
|
|
||||||
|
Mat X = d_X.toHost();
|
||||||
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||||
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- LLT solve with explicit context ----------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_llt_solve_expr_context(Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = make_spd<Mat>(n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
GpuContext ctx;
|
||||||
|
DeviceMatrix<Scalar> d_X;
|
||||||
|
d_X.device(ctx) = d_A.llt().solve(d_B);
|
||||||
|
|
||||||
|
Mat X = d_X.toHost();
|
||||||
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||||
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_lu_solve_expr(Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_X;
|
||||||
|
d_X = d_A.lu().solve(d_B);
|
||||||
|
|
||||||
|
Mat X = d_X.toHost();
|
||||||
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||||
|
VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_gemm_then_solve(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
Mat D = Mat::Random(n, 1);
|
||||||
|
|
||||||
|
// Make SPD: C = A^H * A + n*I
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
d_C = d_A.adjoint() * d_A;
|
||||||
|
|
||||||
|
// Add n*I on host (no element-wise ops on DeviceMatrix yet).
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
C += Mat::Identity(n, n) * static_cast<Scalar>(n);
|
||||||
|
d_C = DeviceMatrix<Scalar>::fromHost(C);
|
||||||
|
|
||||||
|
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_X;
|
||||||
|
d_X = d_C.llt().solve(d_D);
|
||||||
|
|
||||||
|
Mat X = d_X.toHost();
|
||||||
|
RealScalar residual = (C * X - D).norm() / D.norm();
|
||||||
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- LLT solve with Upper triangle -----------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_llt_solve_upper(Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = make_spd<Mat>(n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_X;
|
||||||
|
d_X = d_A.template llt<Upper>().solve(d_B);
|
||||||
|
|
||||||
|
Mat X = d_X.toHost();
|
||||||
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||||
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- LU solve with explicit context -----------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_lu_solve_expr_context(Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
GpuContext ctx;
|
||||||
|
DeviceMatrix<Scalar> d_X;
|
||||||
|
d_X.device(ctx) = d_A.lu().solve(d_B);
|
||||||
|
|
||||||
|
Mat X = d_X.toHost();
|
||||||
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||||
|
VERIFY(residual < RealScalar(10) * RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Zero-nrhs solver expressions ------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_llt_solve_zero_nrhs(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
|
||||||
|
Mat A = make_spd<Mat>(n);
|
||||||
|
Mat B = Mat::Random(n, 0);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_X;
|
||||||
|
d_X = d_A.llt().solve(d_B);
|
||||||
|
|
||||||
|
VERIFY_IS_EQUAL(d_X.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(d_X.cols(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_lu_solve_zero_nrhs(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
Mat B = Mat::Random(n, 0);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_X;
|
||||||
|
d_X = d_A.lu().solve(d_B);
|
||||||
|
|
||||||
|
VERIFY_IS_EQUAL(d_X.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(d_X.cols(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- TRSM: triangularView<UpLo>().solve(B) ----------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar, int UpLo>
|
||||||
|
void test_trsm(Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
// Build a well-conditioned triangular matrix.
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
A.diagonal().array() += static_cast<Scalar>(n); // ensure non-singular
|
||||||
|
if (UpLo == Lower)
|
||||||
|
A = A.template triangularView<Lower>();
|
||||||
|
else
|
||||||
|
A = A.template triangularView<Upper>();
|
||||||
|
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_X;
|
||||||
|
d_X = d_A.template triangularView<UpLo>().solve(d_B);
|
||||||
|
|
||||||
|
Mat X = d_X.toHost();
|
||||||
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||||
|
VERIFY(residual < RealScalar(n) * gpu_unit_roundoff<Scalar>());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SYMM/HEMM: selfadjointView<UpLo>() * B --------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar, int UpLo>
|
||||||
|
void test_symm(Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = make_spd<Mat>(n); // SPD is also self-adjoint
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
d_C = d_A.template selfadjointView<UpLo>() * d_B;
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
Mat C_ref = A * B; // A is symmetric, so full multiply == symm
|
||||||
|
|
||||||
|
RealScalar tol = gemm_error_bound<Scalar>(n, A.norm(), B.norm());
|
||||||
|
VERIFY((C - C_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SYRK/HERK: rankUpdate(A) → C = A * A^H --------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_syrk(Index n, Index k) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, k);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_C;
|
||||||
|
d_C.template selfadjointView<Lower>().rankUpdate(d_A);
|
||||||
|
|
||||||
|
Mat C = d_C.toHost();
|
||||||
|
// Only lower triangle is meaningful for SYRK. Compare lower triangle.
|
||||||
|
Mat C_ref = A * A.adjoint();
|
||||||
|
|
||||||
|
// Extract lower triangle for comparison.
|
||||||
|
Mat C_lower = C.template triangularView<Lower>();
|
||||||
|
Mat C_ref_lower = C_ref.template triangularView<Lower>();
|
||||||
|
|
||||||
|
RealScalar tol = gemm_error_bound<Scalar>(k, A.norm(), A.norm());
|
||||||
|
VERIFY((C_lower - C_ref_lower).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST(test_gemm_basic<Scalar>(64, 64, 64));
|
||||||
|
CALL_SUBTEST(test_gemm_basic<Scalar>(128, 64, 32));
|
||||||
|
CALL_SUBTEST(test_gemm_basic<Scalar>(1, 1, 1));
|
||||||
|
CALL_SUBTEST(test_gemm_basic<Scalar>(256, 256, 256));
|
||||||
|
|
||||||
|
CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(64, 64, 64));
|
||||||
|
CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(128, 32, 64));
|
||||||
|
|
||||||
|
CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(64, 64, 64));
|
||||||
|
CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(128, 32, 64));
|
||||||
|
|
||||||
|
CALL_SUBTEST(test_gemm_scaled<Scalar>(64, 64, 64));
|
||||||
|
CALL_SUBTEST(test_gemm_scaled_rhs<Scalar>(64, 64, 64));
|
||||||
|
CALL_SUBTEST(test_gemm_accumulate<Scalar>(64, 64, 64));
|
||||||
|
CALL_SUBTEST(test_gemm_accumulate_empty<Scalar>(64, 64, 64));
|
||||||
|
CALL_SUBTEST(test_gemm_subtract<Scalar>(64, 64, 64));
|
||||||
|
CALL_SUBTEST(test_gemm_subtract_empty<Scalar>(64, 64, 64));
|
||||||
|
CALL_SUBTEST(test_gemm_dimension_mismatch<Scalar>());
|
||||||
|
CALL_SUBTEST(test_gemm_explicit_context<Scalar>(64, 64, 64));
|
||||||
|
CALL_SUBTEST(test_gemm_cross_context_reuse<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_gemm_cross_context_resize<Scalar>());
|
||||||
|
CALL_SUBTEST(test_gemm_chain<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_gemm_identity<Scalar>(64));
|
||||||
|
|
||||||
|
// Solver expressions — zero-size edge cases (use dedicated tests, not residual-based)
|
||||||
|
|
||||||
|
// Solver expressions
|
||||||
|
CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 1));
|
||||||
|
CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_llt_solve_expr<Scalar>(256, 8));
|
||||||
|
CALL_SUBTEST(test_llt_solve_expr_context<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_llt_solve_upper<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 1));
|
||||||
|
CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_lu_solve_expr<Scalar>(256, 8));
|
||||||
|
CALL_SUBTEST(test_lu_solve_expr_context<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(0));
|
||||||
|
CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(0));
|
||||||
|
CALL_SUBTEST(test_gemm_then_solve<Scalar>(64));
|
||||||
|
|
||||||
|
// TRSM
|
||||||
|
CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 1)));
|
||||||
|
CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 4)));
|
||||||
|
CALL_SUBTEST((test_trsm<Scalar, Upper>(64, 4)));
|
||||||
|
CALL_SUBTEST((test_trsm<Scalar, Lower>(256, 8)));
|
||||||
|
|
||||||
|
// SYMM/HEMM
|
||||||
|
CALL_SUBTEST((test_symm<Scalar, Lower>(64, 4)));
|
||||||
|
CALL_SUBTEST((test_symm<Scalar, Upper>(64, 4)));
|
||||||
|
CALL_SUBTEST((test_symm<Scalar, Lower>(128, 8)));
|
||||||
|
|
||||||
|
// SYRK/HERK
|
||||||
|
CALL_SUBTEST(test_syrk<Scalar>(64, 64));
|
||||||
|
CALL_SUBTEST(test_syrk<Scalar>(64, 32));
|
||||||
|
CALL_SUBTEST(test_syrk<Scalar>(128, 64));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solver failure mode tests (not templated on Scalar) --------------------
|
||||||
|
|
||||||
|
void test_llt_not_spd() {
|
||||||
|
// Negative definite matrix — LLT factorization must fail.
|
||||||
|
MatrixXd A = -MatrixXd::Identity(8, 8);
|
||||||
|
MatrixXd B = MatrixXd::Random(8, 1);
|
||||||
|
auto d_A = DeviceMatrix<double>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<double>::fromHost(B);
|
||||||
|
DeviceMatrix<double> d_X;
|
||||||
|
VERIFY_RAISES_ASSERT(d_X = d_A.llt().solve(d_B));
|
||||||
|
}
|
||||||
|
|
||||||
|
void test_lu_singular() {
|
||||||
|
// Zero matrix — LU factorization must detect singularity.
|
||||||
|
MatrixXd A = MatrixXd::Zero(8, 8);
|
||||||
|
MatrixXd B = MatrixXd::Random(8, 1);
|
||||||
|
auto d_A = DeviceMatrix<double>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<double>::fromHost(B);
|
||||||
|
DeviceMatrix<double> d_X;
|
||||||
|
VERIFY_RAISES_ASSERT(d_X = d_A.lu().solve(d_B));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test entry point: runs the templated suite for float, double and their
// complex counterparts, then the two double-only failure-mode tests.
EIGEN_DECLARE_TEST(gpu_cublas) {
  // Per-scalar suites.
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());

  // Solver failure modes (not templated on Scalar).
  CALL_SUBTEST(test_llt_not_spd());
  CALL_SUBTEST(test_lu_singular());
}
|
||||||
154
test/gpu_cudss_ldlt.cpp
Normal file
154
test/gpu_cudss_ldlt.cpp
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuSparseLDLT: GPU sparse LDL^T via cuDSS.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Helper: build a random sparse symmetric indefinite matrix ---------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
SparseMatrix<Scalar, ColMajor, int> make_symmetric_indefinite(Index n, double density = 0.1) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
|
||||||
|
// Build a random sparse matrix and symmetrize it.
|
||||||
|
// The diagonal has mixed signs to ensure indefiniteness.
|
||||||
|
SpMat R(n, n);
|
||||||
|
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
|
||||||
|
for (Index j = 0; j < n; ++j) {
|
||||||
|
for (Index i = 0; i < n; ++i) {
|
||||||
|
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
|
||||||
|
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
R.makeCompressed();
|
||||||
|
|
||||||
|
// A = R + R^H (symmetric), then add diagonal with alternating signs for indefiniteness.
|
||||||
|
SpMat A = R + SparseMatrix<Scalar, ColMajor, int>(R.adjoint());
|
||||||
|
for (Index i = 0; i < n; ++i) {
|
||||||
|
Scalar diag_val = Scalar((i % 2 == 0) ? n : -n);
|
||||||
|
A.coeffRef(i, i) += diag_val;
|
||||||
|
}
|
||||||
|
A.makeCompressed();
|
||||||
|
return A;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve and check residual -----------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_solve(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_symmetric_indefinite<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLDLT<Scalar> ldlt(A);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.info(), Success);
|
||||||
|
|
||||||
|
Vec x = ldlt.solve(b);
|
||||||
|
VERIFY_IS_EQUAL(x.rows(), n);
|
||||||
|
|
||||||
|
Vec r = A * x - b;
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(r.norm() / b.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Multiple RHS -----------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_multiple_rhs(Index n, Index nrhs) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_symmetric_indefinite<Scalar>(n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
GpuSparseLDLT<Scalar> ldlt(A);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.info(), Success);
|
||||||
|
|
||||||
|
Mat X = ldlt.solve(B);
|
||||||
|
VERIFY_IS_EQUAL(X.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(X.cols(), nrhs);
|
||||||
|
|
||||||
|
Mat R = A * X - B;
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(R.norm() / B.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Refactorize ------------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_refactorize(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_symmetric_indefinite<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLDLT<Scalar> ldlt;
|
||||||
|
ldlt.analyzePattern(A);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.info(), Success);
|
||||||
|
|
||||||
|
ldlt.factorize(A);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.info(), Success);
|
||||||
|
Vec x1 = ldlt.solve(b);
|
||||||
|
|
||||||
|
// Modify values, keep pattern.
|
||||||
|
SpMat A2 = A;
|
||||||
|
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
|
||||||
|
|
||||||
|
ldlt.factorize(A2);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.info(), Success);
|
||||||
|
Vec x2 = ldlt.solve(b);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((A * x1 - b).norm() / b.norm() < tol);
|
||||||
|
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
|
||||||
|
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Empty ------------------------------------------------------------------
|
||||||
|
|
||||||
|
void test_empty() {
|
||||||
|
using SpMat = SparseMatrix<double, ColMajor, int>;
|
||||||
|
SpMat A(0, 0);
|
||||||
|
A.makeCompressed();
|
||||||
|
GpuSparseLDLT<double> ldlt(A);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.info(), Success);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.rows(), 0);
|
||||||
|
VERIFY_IS_EQUAL(ldlt.cols(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST(test_solve<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_solve<Scalar>(256));
|
||||||
|
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_refactorize<Scalar>(64));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test entry point: per-scalar suites for real and complex float/double,
// followed by the double-only empty-matrix test.
EIGEN_DECLARE_TEST(gpu_cudss_ldlt) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());

  CALL_SUBTEST(test_empty());
}
|
||||||
202
test/gpu_cudss_llt.cpp
Normal file
202
test/gpu_cudss_llt.cpp
Normal file
@@ -0,0 +1,202 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuSparseLLT: GPU sparse Cholesky via cuDSS.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Helper: build a random sparse SPD matrix -------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
// Uses the global std::rand state seeded by the test framework (g_seed).
|
||||||
|
SpMat R(n, n);
|
||||||
|
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
|
||||||
|
for (Index j = 0; j < n; ++j) {
|
||||||
|
for (Index i = 0; i < n; ++i) {
|
||||||
|
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
|
||||||
|
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
R.makeCompressed();
|
||||||
|
|
||||||
|
// A = R^H * R + n * I (guaranteed SPD).
|
||||||
|
SpMat A = R.adjoint() * R;
|
||||||
|
for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
|
||||||
|
A.makeCompressed();
|
||||||
|
return A;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve and check residual -----------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_solve(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_spd<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLLT<Scalar> llt(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
|
||||||
|
Vec x = llt.solve(b);
|
||||||
|
VERIFY_IS_EQUAL(x.rows(), n);
|
||||||
|
|
||||||
|
// Check residual: ||Ax - b|| / ||b||.
|
||||||
|
Vec r = A * x - b;
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(r.norm() / b.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Compare with CPU SimplicialLLT -----------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_vs_cpu(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_spd<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLLT<Scalar> gpu_llt(A);
|
||||||
|
VERIFY_IS_EQUAL(gpu_llt.info(), Success);
|
||||||
|
Vec x_gpu = gpu_llt.solve(b);
|
||||||
|
|
||||||
|
SimplicialLLT<SpMat> cpu_llt(A);
|
||||||
|
VERIFY_IS_EQUAL(cpu_llt.info(), Success);
|
||||||
|
Vec x_cpu = cpu_llt.solve(b);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((x_gpu - x_cpu).norm() / x_cpu.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Multiple RHS -----------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_multiple_rhs(Index n, Index nrhs) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_spd<Scalar>(n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
GpuSparseLLT<Scalar> llt(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
|
||||||
|
Mat X = llt.solve(B);
|
||||||
|
VERIFY_IS_EQUAL(X.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(X.cols(), nrhs);
|
||||||
|
|
||||||
|
Mat R = A * X - B;
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(R.norm() / B.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Separate analyze + factorize (refactorization) -------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_refactorize(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_spd<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLLT<Scalar> llt;
|
||||||
|
llt.analyzePattern(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
|
||||||
|
// First factorize + solve.
|
||||||
|
llt.factorize(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
Vec x1 = llt.solve(b);
|
||||||
|
|
||||||
|
// Modify values (keep same pattern): scale diagonal.
|
||||||
|
SpMat A2 = A;
|
||||||
|
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
|
||||||
|
|
||||||
|
// Refactorize with same pattern.
|
||||||
|
llt.factorize(A2);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
Vec x2 = llt.solve(b);
|
||||||
|
|
||||||
|
// Both solutions should satisfy their respective systems.
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((A * x1 - b).norm() / b.norm() < tol);
|
||||||
|
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
|
||||||
|
|
||||||
|
// Solutions should differ (A2 != A).
|
||||||
|
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Empty matrix -----------------------------------------------------------
|
||||||
|
|
||||||
|
void test_empty() {
|
||||||
|
using SpMat = SparseMatrix<double, ColMajor, int>;
|
||||||
|
SpMat A(0, 0);
|
||||||
|
A.makeCompressed();
|
||||||
|
GpuSparseLLT<double> llt(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
VERIFY_IS_EQUAL(llt.rows(), 0);
|
||||||
|
VERIFY_IS_EQUAL(llt.cols(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Upper triangle ---------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_upper(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_spd<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLLT<Scalar, Upper> llt(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
|
||||||
|
Vec x = llt.solve(b);
|
||||||
|
Vec r = A * x - b;
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(r.norm() / b.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST(test_solve<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_solve<Scalar>(256));
|
||||||
|
CALL_SUBTEST(test_vs_cpu<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_refactorize<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_upper<Scalar>(64));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test entry point: per-scalar suites for real and complex float/double,
// followed by the double-only empty-matrix test.
EIGEN_DECLARE_TEST(gpu_cudss_llt) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());

  CALL_SUBTEST(test_empty());
}
|
||||||
147
test/gpu_cudss_lu.cpp
Normal file
147
test/gpu_cudss_lu.cpp
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuSparseLU: GPU sparse LU via cuDSS.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Helper: build a random sparse non-singular general matrix ---------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
SparseMatrix<Scalar, ColMajor, int> make_general(Index n, double density = 0.1) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat R(n, n);
|
||||||
|
R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
|
||||||
|
for (Index j = 0; j < n; ++j) {
|
||||||
|
for (Index i = 0; i < n; ++i) {
|
||||||
|
if (i == j || (std::rand() / double(RAND_MAX)) < density) {
|
||||||
|
R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Add strong diagonal for non-singularity.
|
||||||
|
for (Index i = 0; i < n; ++i) R.coeffRef(i, i) += Scalar(RealScalar(n));
|
||||||
|
R.makeCompressed();
|
||||||
|
return R;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve and check residual -----------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_solve(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_general<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLU<Scalar> lu(A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
|
||||||
|
Vec x = lu.solve(b);
|
||||||
|
VERIFY_IS_EQUAL(x.rows(), n);
|
||||||
|
|
||||||
|
Vec r = A * x - b;
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(r.norm() / b.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Multiple RHS -----------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_multiple_rhs(Index n, Index nrhs) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_general<Scalar>(n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
GpuSparseLU<Scalar> lu(A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
|
||||||
|
Mat X = lu.solve(B);
|
||||||
|
VERIFY_IS_EQUAL(X.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(X.cols(), nrhs);
|
||||||
|
|
||||||
|
Mat R = A * X - B;
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(R.norm() / B.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Refactorize ------------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_refactorize(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_general<Scalar>(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseLU<Scalar> lu;
|
||||||
|
lu.analyzePattern(A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
|
||||||
|
lu.factorize(A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
Vec x1 = lu.solve(b);
|
||||||
|
|
||||||
|
// Modify values, keep pattern.
|
||||||
|
SpMat A2 = A;
|
||||||
|
for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
|
||||||
|
|
||||||
|
lu.factorize(A2);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
Vec x2 = lu.solve(b);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((A * x1 - b).norm() / b.norm() < tol);
|
||||||
|
VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
|
||||||
|
VERIFY((x1 - x2).norm() > NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Empty ------------------------------------------------------------------
|
||||||
|
|
||||||
|
void test_empty() {
|
||||||
|
using SpMat = SparseMatrix<double, ColMajor, int>;
|
||||||
|
SpMat A(0, 0);
|
||||||
|
A.makeCompressed();
|
||||||
|
GpuSparseLU<double> lu(A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
VERIFY_IS_EQUAL(lu.rows(), 0);
|
||||||
|
VERIFY_IS_EQUAL(lu.cols(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST(test_solve<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_solve<Scalar>(256));
|
||||||
|
CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_refactorize<Scalar>(64));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test entry point: per-scalar suites for real and complex float/double,
// followed by the double-only empty-matrix test.
EIGEN_DECLARE_TEST(gpu_cudss_lu) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());

  CALL_SUBTEST(test_empty());
}
|
||||||
186
test/gpu_cufft.cpp
Normal file
186
test/gpu_cufft.cpp
Normal file
@@ -0,0 +1,186 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuFFT: GPU FFT via cuFFT.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- 1D C2C roundtrip: inv(fwd(x)) ≈ x -------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_c2c_roundtrip(Index n) {
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using Vec = Matrix<Complex, Dynamic, 1>;
|
||||||
|
using RealScalar = Scalar;
|
||||||
|
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
Vec X = fft.fwd(x);
|
||||||
|
VERIFY_IS_EQUAL(X.size(), n);
|
||||||
|
|
||||||
|
Vec y = fft.inv(X);
|
||||||
|
VERIFY_IS_EQUAL(y.size(), n);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((y - x).norm() / x.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 1D C2C known signal: FFT of constant = delta --------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_c2c_constant() {
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using Vec = Matrix<Complex, Dynamic, 1>;
|
||||||
|
using RealScalar = Scalar;
|
||||||
|
|
||||||
|
const int n = 64;
|
||||||
|
Vec x = Vec::Constant(n, Complex(3.0, 0.0));
|
||||||
|
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
Vec X = fft.fwd(x);
|
||||||
|
|
||||||
|
// FFT of constant c: X[0] = c*n, X[k] = 0 for k > 0.
|
||||||
|
RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(n);
|
||||||
|
VERIFY(std::abs(X(0) - Complex(3.0 * n, 0.0)) < tol);
|
||||||
|
for (int k = 1; k < n; ++k) {
|
||||||
|
VERIFY(std::abs(X(k)) < tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 1D R2C/C2R roundtrip: invReal(fwd(r), n) ≈ r --------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_r2c_roundtrip(Index n) {
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using CVec = Matrix<Complex, Dynamic, 1>;
|
||||||
|
using RVec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = Scalar;
|
||||||
|
|
||||||
|
RVec r = RVec::Random(n);
|
||||||
|
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
CVec R = fft.fwd(r);
|
||||||
|
|
||||||
|
// R2C returns n/2+1 complex values.
|
||||||
|
VERIFY_IS_EQUAL(R.size(), n / 2 + 1);
|
||||||
|
|
||||||
|
RVec s = fft.invReal(R, n);
|
||||||
|
VERIFY_IS_EQUAL(s.size(), n);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((s - r).norm() / r.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 2D C2C roundtrip: inv2d(fwd2d(A)) ≈ A ---------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_2d_roundtrip(Index rows, Index cols) {
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using Mat = Matrix<Complex, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = Scalar;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(rows, cols);
|
||||||
|
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
Mat B = fft.fwd2d(A);
|
||||||
|
VERIFY_IS_EQUAL(B.rows(), rows);
|
||||||
|
VERIFY_IS_EQUAL(B.cols(), cols);
|
||||||
|
|
||||||
|
Mat C = fft.inv2d(B);
|
||||||
|
VERIFY_IS_EQUAL(C.rows(), rows);
|
||||||
|
VERIFY_IS_EQUAL(C.cols(), cols);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(rows * cols) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((C - A).norm() / A.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- 2D C2C known signal: constant matrix -----------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_2d_constant() {
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using Mat = Matrix<Complex, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = Scalar;
|
||||||
|
|
||||||
|
const int rows = 16, cols = 32;
|
||||||
|
Mat A = Mat::Constant(rows, cols, Complex(2.0, 0.0));
|
||||||
|
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
Mat B = fft.fwd2d(A);
|
||||||
|
|
||||||
|
// 2D FFT of constant c: B(0,0) = c*rows*cols, all others = 0.
|
||||||
|
RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(rows * cols);
|
||||||
|
VERIFY(std::abs(B(0, 0) - Complex(2.0 * rows * cols, 0.0)) < tol);
|
||||||
|
for (int j = 0; j < cols; ++j) {
|
||||||
|
for (int i = 0; i < rows; ++i) {
|
||||||
|
if (i == 0 && j == 0) continue;
|
||||||
|
VERIFY(std::abs(B(i, j)) < tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Plan reuse: repeated calls should work ---------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_plan_reuse() {
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using Vec = Matrix<Complex, Dynamic, 1>;
|
||||||
|
using RealScalar = Scalar;
|
||||||
|
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
for (int trial = 0; trial < 5; ++trial) {
|
||||||
|
Vec x = Vec::Random(128);
|
||||||
|
Vec X = fft.fwd(x);
|
||||||
|
Vec y = fft.inv(X);
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(128) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((y - x).norm() / x.norm() < tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Empty ------------------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_empty() {
|
||||||
|
using Complex = std::complex<Scalar>;
|
||||||
|
using Vec = Matrix<Complex, Dynamic, 1>;
|
||||||
|
|
||||||
|
GpuFFT<Scalar> fft;
|
||||||
|
Vec x(0);
|
||||||
|
Vec X = fft.fwd(x);
|
||||||
|
VERIFY_IS_EQUAL(X.size(), 0);
|
||||||
|
Vec y = fft.inv(X);
|
||||||
|
VERIFY_IS_EQUAL(y.size(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(256));
|
||||||
|
CALL_SUBTEST(test_c2c_roundtrip<Scalar>(1000)); // non-power-of-2
|
||||||
|
CALL_SUBTEST(test_c2c_constant<Scalar>());
|
||||||
|
CALL_SUBTEST(test_r2c_roundtrip<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_r2c_roundtrip<Scalar>(256));
|
||||||
|
CALL_SUBTEST(test_2d_roundtrip<Scalar>(32, 32));
|
||||||
|
CALL_SUBTEST(test_2d_roundtrip<Scalar>(16, 64)); // non-square
|
||||||
|
CALL_SUBTEST(test_2d_constant<Scalar>());
|
||||||
|
CALL_SUBTEST(test_plan_reuse<Scalar>());
|
||||||
|
CALL_SUBTEST(test_empty<Scalar>());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test entry point: the suite is instantiated for the two real precisions;
// complex vectors are built from them inside each test.
EIGEN_DECLARE_TEST(gpu_cufft) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
}
|
||||||
180
test/gpu_cusolver_eigen.cpp
Normal file
180
test/gpu_cusolver_eigen.cpp
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuSelfAdjointEigenSolver: GPU symmetric/Hermitian eigenvalue
|
||||||
|
// decomposition via cuSOLVER.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/Eigenvalues>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Reconstruction: V * diag(W) * V^H ≈ A ---------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_eigen_reconstruction(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
// Build a symmetric/Hermitian matrix.
|
||||||
|
Mat R = Mat::Random(n, n);
|
||||||
|
Mat A = R + R.adjoint();
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver<Scalar> es(A);
|
||||||
|
VERIFY_IS_EQUAL(es.info(), Success);
|
||||||
|
|
||||||
|
auto W = es.eigenvalues();
|
||||||
|
Mat V = es.eigenvectors();
|
||||||
|
|
||||||
|
VERIFY_IS_EQUAL(W.size(), n);
|
||||||
|
VERIFY_IS_EQUAL(V.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(V.cols(), n);
|
||||||
|
|
||||||
|
// Reconstruct: A_hat = V * diag(W) * V^H.
|
||||||
|
Mat A_hat = V * W.asDiagonal() * V.adjoint();
|
||||||
|
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
|
||||||
|
VERIFY((A_hat - A).norm() < tol);
|
||||||
|
|
||||||
|
// Orthogonality: V^H * V ≈ I.
|
||||||
|
Mat VhV = V.adjoint() * V;
|
||||||
|
Mat eye = Mat::Identity(n, n);
|
||||||
|
VERIFY((VhV - eye).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Eigenvalues match CPU SelfAdjointEigenSolver ---------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_eigen_values(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat R = Mat::Random(n, n);
|
||||||
|
Mat A = R + R.adjoint();
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver<Scalar> gpu_es(A);
|
||||||
|
VERIFY_IS_EQUAL(gpu_es.info(), Success);
|
||||||
|
auto W_gpu = gpu_es.eigenvalues();
|
||||||
|
|
||||||
|
SelfAdjointEigenSolver<Mat> cpu_es(A);
|
||||||
|
auto W_cpu = cpu_es.eigenvalues();
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() *
|
||||||
|
W_cpu.cwiseAbs().maxCoeff();
|
||||||
|
VERIFY((W_gpu - W_cpu).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Eigenvalues-only mode --------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_eigen_values_only(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat R = Mat::Random(n, n);
|
||||||
|
Mat A = R + R.adjoint();
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver<Scalar> gpu_es(A, GpuSelfAdjointEigenSolver<Scalar>::EigenvaluesOnly);
|
||||||
|
VERIFY_IS_EQUAL(gpu_es.info(), Success);
|
||||||
|
auto W_gpu = gpu_es.eigenvalues();
|
||||||
|
|
||||||
|
SelfAdjointEigenSolver<Mat> cpu_es(A, EigenvaluesOnly);
|
||||||
|
auto W_cpu = cpu_es.eigenvalues();
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() *
|
||||||
|
W_cpu.cwiseAbs().maxCoeff();
|
||||||
|
VERIFY((W_gpu - W_cpu).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- DeviceMatrix input path ------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_eigen_device_matrix(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat R = Mat::Random(n, n);
|
||||||
|
Mat A = R + R.adjoint();
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
GpuSelfAdjointEigenSolver<Scalar> es;
|
||||||
|
es.compute(d_A);
|
||||||
|
VERIFY_IS_EQUAL(es.info(), Success);
|
||||||
|
|
||||||
|
auto W_gpu = es.eigenvalues();
|
||||||
|
Mat V = es.eigenvectors();
|
||||||
|
|
||||||
|
// Verify reconstruction.
|
||||||
|
Mat A_hat = V * W_gpu.asDiagonal() * V.adjoint();
|
||||||
|
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
|
||||||
|
VERIFY((A_hat - A).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Recompute (reuse solver object) ----------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_eigen_recompute(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
GpuSelfAdjointEigenSolver<Scalar> es;
|
||||||
|
|
||||||
|
for (int trial = 0; trial < 3; ++trial) {
|
||||||
|
Mat R = Mat::Random(n, n);
|
||||||
|
Mat A = R + R.adjoint();
|
||||||
|
es.compute(A);
|
||||||
|
VERIFY_IS_EQUAL(es.info(), Success);
|
||||||
|
|
||||||
|
auto W = es.eigenvalues();
|
||||||
|
Mat V = es.eigenvectors();
|
||||||
|
Mat A_hat = V * W.asDiagonal() * V.adjoint();
|
||||||
|
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(n)) * NumTraits<Scalar>::epsilon() * A.norm();
|
||||||
|
VERIFY((A_hat - A).norm() < tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Empty matrix -----------------------------------------------------------
|
||||||
|
|
||||||
|
// Edge case: constructing the solver from a 0 x 0 matrix must report Success
// and expose zero rows and zero columns.
void test_eigen_empty() {
  GpuSelfAdjointEigenSolver<double> es(MatrixXd(0, 0));

  VERIFY_IS_EQUAL(es.info(), Success);
  VERIFY_IS_EQUAL(es.rows(), 0);
  VERIFY_IS_EQUAL(es.cols(), 0);
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
// Reconstruction + orthogonality.
|
||||||
|
CALL_SUBTEST(test_eigen_reconstruction<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_eigen_reconstruction<Scalar>(128));
|
||||||
|
|
||||||
|
// Eigenvalues match CPU.
|
||||||
|
CALL_SUBTEST(test_eigen_values<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_eigen_values<Scalar>(128));
|
||||||
|
|
||||||
|
// Values-only mode.
|
||||||
|
CALL_SUBTEST(test_eigen_values_only<Scalar>(64));
|
||||||
|
|
||||||
|
// DeviceMatrix input.
|
||||||
|
CALL_SUBTEST(test_eigen_device_matrix<Scalar>(64));
|
||||||
|
|
||||||
|
// Recompute.
|
||||||
|
CALL_SUBTEST(test_eigen_recompute<Scalar>(32));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test entry point: runs the per-scalar driver for real and complex scalars in
// single and double precision, then the empty-matrix edge case.
EIGEN_DECLARE_TEST(gpu_cusolver_eigen) {
  // Real scalars.
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());

  // Complex scalars.
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());

  // 0 x 0 input.
  CALL_SUBTEST(test_eigen_empty());
}
|
||||||
210
test/gpu_cusolver_llt.cpp
Normal file
210
test/gpu_cusolver_llt.cpp
Normal file
@@ -0,0 +1,210 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Eigen Authors
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuLLT: GPU Cholesky (LL^T) using cuSOLVER.
|
||||||
|
// Covers cusolverDnXpotrf (factorization) and cusolverDnXpotrs (solve)
|
||||||
|
// for float, double, complex<float>, complex<double>, Lower and Upper.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/Cholesky>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// Build a random symmetric positive-definite matrix: A = M^H*M + n*I.
|
||||||
|
template <typename MatrixType>
|
||||||
|
MatrixType make_spd(Index n) {
|
||||||
|
using Scalar = typename MatrixType::Scalar;
|
||||||
|
MatrixType M = MatrixType::Random(n, n);
|
||||||
|
return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test factorization: GpuLLT must succeed on an SPD matrix and its solve must agree with CPU LLT (the L*L^H reconstruction check below uses the CPU factor).
|
||||||
|
template <typename Scalar, int UpLo>
|
||||||
|
void test_potrf(Index n) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
MatrixType A = make_spd<MatrixType>(n);
|
||||||
|
|
||||||
|
GpuLLT<Scalar, UpLo> llt(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
|
||||||
|
// Reconstruct L*L^H and compare to original A.
|
||||||
|
// GpuLLT stores the factor on device; use CPU LLT to get the triangular factor
|
||||||
|
// for reconstruction since GpuLLT does not expose the device-resident factor directly.
|
||||||
|
LLT<MatrixType, UpLo> ref(A);
|
||||||
|
VERIFY_IS_EQUAL(ref.info(), Success);
|
||||||
|
MatrixType A_reconstructed = ref.reconstructedMatrix();
|
||||||
|
|
||||||
|
// Both should equal A to within n*eps*||A||.
|
||||||
|
RealScalar tol = RealScalar(4) * RealScalar(n) * NumTraits<Scalar>::epsilon() * A.norm();
|
||||||
|
VERIFY((A_reconstructed - A).norm() < tol);
|
||||||
|
|
||||||
|
// Smoke-test: llt.solve(b) should return the same result as ref.solve(b).
|
||||||
|
MatrixType b = MatrixType::Random(n, 1);
|
||||||
|
MatrixType x_gpu = llt.solve(b);
|
||||||
|
MatrixType x_cpu = ref.solve(b);
|
||||||
|
VERIFY((x_gpu - x_cpu).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test solve: residual ||A*X - B|| / ||B|| must be small.
|
||||||
|
template <typename Scalar, int UpLo>
|
||||||
|
void test_potrs(Index n, Index nrhs) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
MatrixType A = make_spd<MatrixType>(n);
|
||||||
|
MatrixType B = MatrixType::Random(n, nrhs);
|
||||||
|
|
||||||
|
GpuLLT<Scalar, UpLo> llt(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
|
||||||
|
MatrixType X = llt.solve(B);
|
||||||
|
|
||||||
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||||
|
RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(residual < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test that multiple solves against the same factor all produce correct results.
|
||||||
|
// This exercises the key design property: L stays on device across calls.
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_multiple_solves(Index n) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
MatrixType A = make_spd<MatrixType>(n);
|
||||||
|
GpuLLT<Scalar, Lower> llt(A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
for (int k = 0; k < 5; ++k) {
|
||||||
|
MatrixType B = MatrixType::Random(n, 3);
|
||||||
|
MatrixType X = llt.solve(B);
|
||||||
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||||
|
VERIFY(residual < tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test that GpuLLT correctly detects a non-SPD matrix.
|
||||||
|
// Failure path: factoring a matrix that is not positive definite must be
// reported as NumericalIssue by GpuLLT::info().
void test_not_spd() {
  // -I (8 x 8) is negative definite.
  MatrixXd A = -MatrixXd::Identity(8, 8);

  GpuLLT<double> llt(A);
  VERIFY_IS_EQUAL(llt.info(), NumericalIssue);
}
|
||||||
|
|
||||||
|
// ---- DeviceMatrix integration tests -----------------------------------------
|
||||||
|
|
||||||
|
// compute(DeviceMatrix) + solve(DeviceMatrix) → toHost
|
||||||
|
template <typename Scalar, int UpLo>
|
||||||
|
void test_device_matrix_solve(Index n, Index nrhs) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
MatrixType A = make_spd<MatrixType>(n);
|
||||||
|
MatrixType B = MatrixType::Random(n, nrhs);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
GpuLLT<Scalar, UpLo> llt;
|
||||||
|
llt.compute(d_A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
|
||||||
|
MatrixType X = d_X.toHost();
|
||||||
|
|
||||||
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||||
|
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
// compute(DeviceMatrix&&) — move path
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_device_matrix_move_compute(Index n) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
MatrixType A = make_spd<MatrixType>(n);
|
||||||
|
MatrixType B = MatrixType::Random(n, 1);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
GpuLLT<Scalar, Lower> llt;
|
||||||
|
llt.compute(std::move(d_A));
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
|
||||||
|
// d_A should be empty after move.
|
||||||
|
VERIFY(d_A.empty());
|
||||||
|
|
||||||
|
MatrixType X = llt.solve(B);
|
||||||
|
RealScalar residual = (A * X - B).norm() / B.norm();
|
||||||
|
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Full async chain: compute → solve → solve again with result as RHS → toHost
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_chaining(Index n) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
MatrixType A = make_spd<MatrixType>(n);
|
||||||
|
MatrixType B = MatrixType::Random(n, 3);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
GpuLLT<Scalar, Lower> llt;
|
||||||
|
llt.compute(d_A);
|
||||||
|
VERIFY_IS_EQUAL(llt.info(), Success);
|
||||||
|
|
||||||
|
// Chain: solve → use result as RHS for another solve
|
||||||
|
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
|
||||||
|
DeviceMatrix<Scalar> d_Y = llt.solve(d_X);
|
||||||
|
|
||||||
|
// Only sync at the very end.
|
||||||
|
MatrixType Y = d_Y.toHost();
|
||||||
|
|
||||||
|
// Verify: Y = A^{-2} * B
|
||||||
|
MatrixType X_ref = LLT<MatrixType, Lower>(A).solve(B);
|
||||||
|
MatrixType Y_ref = LLT<MatrixType, Lower>(A).solve(X_ref);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(4) * RealScalar(n) * NumTraits<Scalar>::epsilon() * Y_ref.norm();
|
||||||
|
VERIFY((Y - Y_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST((test_potrf<Scalar, Lower>(1)));
|
||||||
|
CALL_SUBTEST((test_potrf<Scalar, Lower>(64)));
|
||||||
|
CALL_SUBTEST((test_potrf<Scalar, Lower>(256)));
|
||||||
|
CALL_SUBTEST((test_potrf<Scalar, Upper>(64)));
|
||||||
|
CALL_SUBTEST((test_potrf<Scalar, Upper>(256)));
|
||||||
|
|
||||||
|
CALL_SUBTEST((test_potrs<Scalar, Lower>(64, 1)));
|
||||||
|
CALL_SUBTEST((test_potrs<Scalar, Lower>(64, 4)));
|
||||||
|
CALL_SUBTEST((test_potrs<Scalar, Lower>(256, 8)));
|
||||||
|
CALL_SUBTEST((test_potrs<Scalar, Upper>(64, 1)));
|
||||||
|
CALL_SUBTEST((test_potrs<Scalar, Upper>(256, 4)));
|
||||||
|
|
||||||
|
CALL_SUBTEST(test_multiple_solves<Scalar>(128));
|
||||||
|
|
||||||
|
CALL_SUBTEST((test_device_matrix_solve<Scalar, Lower>(64, 4)));
|
||||||
|
CALL_SUBTEST((test_device_matrix_solve<Scalar, Upper>(128, 1)));
|
||||||
|
CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_chaining<Scalar>(64));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test entry point: runs the per-scalar driver for real and complex scalars in
// single and double precision, then the non-SPD failure case.
EIGEN_DECLARE_TEST(gpu_cusolver_llt) {
  // Real scalars.
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());

  // Complex scalars.
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());

  // Non-positive-definite input must be detected.
  CALL_SUBTEST(test_not_spd());
}
|
||||||
206
test/gpu_cusolver_lu.cpp
Normal file
206
test/gpu_cusolver_lu.cpp
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Eigen Authors
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuLU: GPU partial-pivoting LU decomposition via cuSOLVER.
|
||||||
|
// Covers cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve)
|
||||||
|
// for float, double, complex<float>, complex<double>.
|
||||||
|
//
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/LU>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Test factorization + NoTrans solve: residual ||A*X - B|| / ||B|| -------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_getrf(Index n) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
MatrixType A = MatrixType::Random(n, n);
|
||||||
|
MatrixType B = MatrixType::Random(n, 4);
|
||||||
|
|
||||||
|
GpuLU<Scalar> lu(A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
|
||||||
|
MatrixType X = lu.solve(B);
|
||||||
|
// Backward error bound for LU: ||A*X - B|| <= O(n*u) * ||A|| * ||X||.
|
||||||
|
// Normalize by ||A||*||X|| rather than ||B|| to be condition-number agnostic.
|
||||||
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||||
|
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Test solve: A^T*X = B and A^H*X = B ------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_getrs_trans(Index n) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
MatrixType A = MatrixType::Random(n, n);
|
||||||
|
MatrixType B = MatrixType::Random(n, 3);
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
|
||||||
|
GpuLU<Scalar> lu(A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
|
||||||
|
MatrixType Xt = lu.solve(B, GpuLU<Scalar>::Transpose);
|
||||||
|
VERIFY((A.transpose() * Xt - B).norm() / (A.norm() * Xt.norm()) < tol);
|
||||||
|
|
||||||
|
MatrixType Xc = lu.solve(B, GpuLU<Scalar>::ConjugateTranspose);
|
||||||
|
VERIFY((A.adjoint() * Xc - B).norm() / (A.norm() * Xc.norm()) < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Test multiple solves reuse the device-resident LU ----------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_multiple_solves(Index n) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
MatrixType A = MatrixType::Random(n, n);
|
||||||
|
GpuLU<Scalar> lu(A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
for (int k = 0; k < 5; ++k) {
|
||||||
|
MatrixType B = MatrixType::Random(n, 3);
|
||||||
|
MatrixType X = lu.solve(B);
|
||||||
|
VERIFY((A * X - B).norm() / (A.norm() * X.norm()) < tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Agreement with CPU PartialPivLU ----------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_vs_cpu(Index n) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
MatrixType A = MatrixType::Random(n, n);
|
||||||
|
MatrixType B = MatrixType::Random(n, 5);
|
||||||
|
|
||||||
|
GpuLU<Scalar> gpu_lu(A);
|
||||||
|
VERIFY_IS_EQUAL(gpu_lu.info(), Success);
|
||||||
|
|
||||||
|
MatrixType X_gpu = gpu_lu.solve(B);
|
||||||
|
MatrixType X_cpu = PartialPivLU<MatrixType>(A).solve(B);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Singular matrix detection ----------------------------------------------
|
||||||
|
|
||||||
|
// Failure path: factoring the 8 x 8 zero matrix must be reported as
// NumericalIssue by GpuLU::info().
void test_singular() {
  // The all-zero matrix is singular.
  MatrixXd A = MatrixXd::Zero(8, 8);

  GpuLU<double> lu(A);
  VERIFY_IS_EQUAL(lu.info(), NumericalIssue);
}
|
||||||
|
|
||||||
|
// ---- DeviceMatrix integration tests -----------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_device_matrix_solve(Index n) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
MatrixType A = MatrixType::Random(n, n);
|
||||||
|
MatrixType B = MatrixType::Random(n, 4);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
GpuLU<Scalar> lu;
|
||||||
|
lu.compute(d_A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_X = lu.solve(d_B);
|
||||||
|
MatrixType X = d_X.toHost();
|
||||||
|
|
||||||
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||||
|
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_device_matrix_move_compute(Index n) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
MatrixType A = MatrixType::Random(n, n);
|
||||||
|
MatrixType B = MatrixType::Random(n, 1);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
GpuLU<Scalar> lu;
|
||||||
|
lu.compute(std::move(d_A));
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
VERIFY(d_A.empty());
|
||||||
|
|
||||||
|
MatrixType X = lu.solve(B);
|
||||||
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||||
|
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_chaining(Index n) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
MatrixType A = MatrixType::Random(n, n);
|
||||||
|
MatrixType B = MatrixType::Random(n, 3);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
GpuLU<Scalar> lu;
|
||||||
|
lu.compute(d_A);
|
||||||
|
VERIFY_IS_EQUAL(lu.info(), Success);
|
||||||
|
|
||||||
|
// Chain: solve → use result as RHS
|
||||||
|
DeviceMatrix<Scalar> d_X = lu.solve(d_B);
|
||||||
|
DeviceMatrix<Scalar> d_Y = lu.solve(d_X);
|
||||||
|
MatrixType Y = d_Y.toHost();
|
||||||
|
|
||||||
|
MatrixType X_ref = PartialPivLU<MatrixType>(A).solve(B);
|
||||||
|
MatrixType Y_ref = PartialPivLU<MatrixType>(A).solve(X_ref);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon() * Y_ref.norm();
|
||||||
|
VERIFY((Y - Y_ref).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver -------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST(test_getrf<Scalar>(1));
|
||||||
|
CALL_SUBTEST(test_getrf<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_getrf<Scalar>(256));
|
||||||
|
|
||||||
|
CALL_SUBTEST(test_getrs_trans<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_getrs_trans<Scalar>(128));
|
||||||
|
|
||||||
|
CALL_SUBTEST(test_multiple_solves<Scalar>(128));
|
||||||
|
|
||||||
|
CALL_SUBTEST(test_vs_cpu<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_vs_cpu<Scalar>(256));
|
||||||
|
|
||||||
|
CALL_SUBTEST(test_device_matrix_solve<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_chaining<Scalar>(64));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test entry point: runs the per-scalar driver for real and complex scalars in
// single and double precision, then the singular-matrix failure case.
EIGEN_DECLARE_TEST(gpu_cusolver_lu) {
  // Real scalars.
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());

  // Complex scalars.
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());

  // Singular input must be detected.
  CALL_SUBTEST(test_singular());
}
|
||||||
185
test/gpu_cusolver_qr.cpp
Normal file
185
test/gpu_cusolver_qr.cpp
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuQR: GPU QR decomposition via cuSOLVER.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/QR>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Solve square system: A * X = B -----------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_qr_solve_square(Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
GpuQR<Scalar> qr(A);
|
||||||
|
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||||
|
|
||||||
|
Mat X = qr.solve(B);
|
||||||
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||||
|
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve overdetermined system: m > n (least-squares) ---------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_qr_solve_overdetermined(Index m, Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
eigen_assert(m >= n);
|
||||||
|
Mat A = Mat::Random(m, n);
|
||||||
|
Mat B = Mat::Random(m, nrhs);
|
||||||
|
|
||||||
|
GpuQR<Scalar> qr(A);
|
||||||
|
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||||
|
|
||||||
|
Mat X = qr.solve(B);
|
||||||
|
VERIFY_IS_EQUAL(X.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(X.cols(), nrhs);
|
||||||
|
|
||||||
|
// Compare with CPU QR.
|
||||||
|
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(m) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve with DeviceMatrix input ------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_qr_solve_device(Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
GpuQR<Scalar> qr;
|
||||||
|
qr.compute(d_A);
|
||||||
|
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_X = qr.solve(d_B);
|
||||||
|
Mat X = d_X.toHost();
|
||||||
|
|
||||||
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||||
|
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve overdetermined via device path -----------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_qr_solve_overdetermined_device(Index m, Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
eigen_assert(m >= n);
|
||||||
|
Mat A = Mat::Random(m, n);
|
||||||
|
Mat B = Mat::Random(m, nrhs);
|
||||||
|
|
||||||
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||||||
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||||||
|
|
||||||
|
GpuQR<Scalar> qr;
|
||||||
|
qr.compute(d_A);
|
||||||
|
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||||
|
|
||||||
|
DeviceMatrix<Scalar> d_X = qr.solve(d_B);
|
||||||
|
VERIFY_IS_EQUAL(d_X.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(d_X.cols(), nrhs);
|
||||||
|
|
||||||
|
Mat X = d_X.toHost();
|
||||||
|
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(m) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Multiple solves reuse the factorization --------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_qr_multiple_solves(Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
GpuQR<Scalar> qr(A);
|
||||||
|
VERIFY_IS_EQUAL(qr.info(), Success);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
for (int k = 0; k < 5; ++k) {
|
||||||
|
Mat B = Mat::Random(n, 3);
|
||||||
|
Mat X = qr.solve(B);
|
||||||
|
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
|
||||||
|
VERIFY(residual < tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Agreement with CPU HouseholderQR ---------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_qr_vs_cpu(Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(n, n);
|
||||||
|
Mat B = Mat::Random(n, nrhs);
|
||||||
|
|
||||||
|
GpuQR<Scalar> gpu_qr(A);
|
||||||
|
VERIFY_IS_EQUAL(gpu_qr.info(), Success);
|
||||||
|
|
||||||
|
Mat X_gpu = gpu_qr.solve(B);
|
||||||
|
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST(test_qr_solve_square<Scalar>(1, 1));
|
||||||
|
CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 1));
|
||||||
|
CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_qr_solve_square<Scalar>(256, 8));
|
||||||
|
|
||||||
|
CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(128, 64, 4));
|
||||||
|
CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(256, 128, 1));
|
||||||
|
|
||||||
|
CALL_SUBTEST(test_qr_solve_device<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_qr_solve_overdetermined_device<Scalar>(128, 64, 4));
|
||||||
|
CALL_SUBTEST(test_qr_multiple_solves<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_qr_vs_cpu<Scalar>(64, 4));
|
||||||
|
CALL_SUBTEST(test_qr_vs_cpu<Scalar>(256, 8));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Edge case: constructing GpuQR from a 0 x 0 matrix must report Success and
// expose zero rows and zero columns.
void test_qr_empty() {
  GpuQR<double> qr(MatrixXd(0, 0));

  VERIFY_IS_EQUAL(qr.info(), Success);
  VERIFY_IS_EQUAL(qr.rows(), 0);
  VERIFY_IS_EQUAL(qr.cols(), 0);
}
|
||||||
|
|
||||||
|
// Test entry point: runs the per-scalar driver for real and complex scalars in
// single and double precision, then the empty-matrix edge case.
EIGEN_DECLARE_TEST(gpu_cusolver_qr) {
  // Real scalars.
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());

  // Complex scalars.
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());

  // 0 x 0 input.
  CALL_SUBTEST(test_qr_empty());
}
|
||||||
194
test/gpu_cusolver_svd.cpp
Normal file
194
test/gpu_cusolver_svd.cpp
Normal file
@@ -0,0 +1,194 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuSVD: GPU SVD via cuSOLVER.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/SVD>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- SVD reconstruction: U * diag(S) * VT ≈ A ------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar, unsigned int Options>
|
||||||
|
void test_svd_reconstruction(Index m, Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, n);
|
||||||
|
GpuSVD<Scalar> svd(A, Options);
|
||||||
|
VERIFY_IS_EQUAL(svd.info(), Success);
|
||||||
|
|
||||||
|
auto S = svd.singularValues();
|
||||||
|
Mat U = svd.matrixU();
|
||||||
|
Mat VT = svd.matrixVT();
|
||||||
|
|
||||||
|
const Index k = (std::min)(m, n);
|
||||||
|
|
||||||
|
// Reconstruct: A_hat = U[:,:k] * diag(S) * VT[:k,:].
|
||||||
|
Mat A_hat = U.leftCols(k) * S.asDiagonal() * VT.topRows(k);
|
||||||
|
RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(k)) * NumTraits<Scalar>::epsilon() * A.norm();
|
||||||
|
VERIFY((A_hat - A).norm() < tol);
|
||||||
|
|
||||||
|
// Orthogonality: U^H * U ≈ I.
|
||||||
|
Mat UtU = U.adjoint() * U;
|
||||||
|
Mat I_u = Mat::Identity(U.cols(), U.cols());
|
||||||
|
VERIFY((UtU - I_u).norm() < tol);
|
||||||
|
|
||||||
|
// Orthogonality: VT * VT^H ≈ I.
|
||||||
|
Mat VtVh = VT * VT.adjoint();
|
||||||
|
Mat I_v = Mat::Identity(VT.rows(), VT.rows());
|
||||||
|
VERIFY((VtVh - I_v).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Singular values match CPU BDCSVD ---------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_svd_singular_values(Index m, Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, n);
|
||||||
|
GpuSVD<Scalar> svd(A, 0); // values only
|
||||||
|
VERIFY_IS_EQUAL(svd.info(), Success);
|
||||||
|
|
||||||
|
auto S_gpu = svd.singularValues();
|
||||||
|
auto S_cpu = BDCSVD<Mat>(A, 0).singularValues();
|
||||||
|
|
||||||
|
RealScalar tol =
|
||||||
|
RealScalar(5) * std::sqrt(static_cast<RealScalar>((std::min)(m, n))) * NumTraits<Scalar>::epsilon() * S_cpu(0);
|
||||||
|
VERIFY((S_gpu - S_cpu).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve: pseudoinverse ---------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_svd_solve(Index m, Index n, Index nrhs) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, n);
|
||||||
|
Mat B = Mat::Random(m, nrhs);
|
||||||
|
|
||||||
|
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
|
||||||
|
VERIFY_IS_EQUAL(svd.info(), Success);
|
||||||
|
|
||||||
|
Mat X = svd.solve(B);
|
||||||
|
VERIFY_IS_EQUAL(X.rows(), n);
|
||||||
|
VERIFY_IS_EQUAL(X.cols(), nrhs);
|
||||||
|
|
||||||
|
// Compare with CPU BDCSVD solve.
|
||||||
|
Mat X_cpu = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV).solve(B);
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar((std::max)(m, n)) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve: truncated -------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_svd_solve_truncated(Index m, Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, n);
|
||||||
|
Mat B = Mat::Random(m, 1);
|
||||||
|
const Index k = (std::min)(m, n);
|
||||||
|
const Index trunc = k / 2;
|
||||||
|
eigen_assert(trunc > 0);
|
||||||
|
|
||||||
|
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
|
||||||
|
Mat X_trunc = svd.solve(B, trunc);
|
||||||
|
|
||||||
|
// Build CPU reference: truncated pseudoinverse.
|
||||||
|
auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
|
||||||
|
auto S = cpu_svd.singularValues();
|
||||||
|
Mat U = cpu_svd.matrixU();
|
||||||
|
Mat V = cpu_svd.matrixV();
|
||||||
|
|
||||||
|
// D_ii = 1/S_i for i < trunc, 0 otherwise.
|
||||||
|
Matrix<RealScalar, Dynamic, 1> D = Matrix<RealScalar, Dynamic, 1>::Zero(k);
|
||||||
|
for (Index i = 0; i < trunc; ++i) D(i) = RealScalar(1) / S(i);
|
||||||
|
Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((X_trunc - X_ref).norm() / X_ref.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Solve: Tikhonov regularized --------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_svd_solve_regularized(Index m, Index n) {
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
Mat A = Mat::Random(m, n);
|
||||||
|
Mat B = Mat::Random(m, 1);
|
||||||
|
RealScalar lambda = RealScalar(0.1);
|
||||||
|
const Index k = (std::min)(m, n);
|
||||||
|
|
||||||
|
GpuSVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
|
||||||
|
Mat X_reg = svd.solve(B, lambda);
|
||||||
|
|
||||||
|
// CPU reference: D_ii = S_i / (S_i^2 + lambda^2).
|
||||||
|
auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
|
||||||
|
auto S = cpu_svd.singularValues();
|
||||||
|
Mat U = cpu_svd.matrixU();
|
||||||
|
Mat V = cpu_svd.matrixV();
|
||||||
|
|
||||||
|
Matrix<RealScalar, Dynamic, 1> D(k);
|
||||||
|
for (Index i = 0; i < k; ++i) D(i) = S(i) / (S(i) * S(i) + lambda * lambda);
|
||||||
|
Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((X_reg - X_ref).norm() / X_ref.norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Empty matrix -----------------------------------------------------------
|
||||||
|
|
||||||
|
// Degenerate input: constructing a GpuSVD from a 0x0 matrix with options 0
// must report Success and expose zero rows and zero columns.
void test_svd_empty() {
  GpuSVD<double> svd(MatrixXd(0, 0), 0);

  // Status first, then the reported shape.
  VERIFY_IS_EQUAL(svd.info(), Success);
  VERIFY_IS_EQUAL(svd.rows(), 0);
  VERIFY_IS_EQUAL(svd.cols(), 0);
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
// Reconstruction + orthogonality (thin and full, identical test logic).
|
||||||
|
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 64)));
|
||||||
|
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(128, 64)));
|
||||||
|
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 128))); // wide (m < n)
|
||||||
|
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(64, 64)));
|
||||||
|
CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(128, 64)));
|
||||||
|
|
||||||
|
// Singular values.
|
||||||
|
CALL_SUBTEST(test_svd_singular_values<Scalar>(64, 64));
|
||||||
|
CALL_SUBTEST(test_svd_singular_values<Scalar>(128, 64));
|
||||||
|
|
||||||
|
// Solve.
|
||||||
|
CALL_SUBTEST(test_svd_solve<Scalar>(64, 64, 4));
|
||||||
|
CALL_SUBTEST(test_svd_solve<Scalar>(128, 64, 4));
|
||||||
|
CALL_SUBTEST(test_svd_solve<Scalar>(64, 128, 4)); // wide (m < n)
|
||||||
|
|
||||||
|
// Truncated and regularized solve.
|
||||||
|
CALL_SUBTEST(test_svd_solve_truncated<Scalar>(64, 64));
|
||||||
|
CALL_SUBTEST(test_svd_solve_regularized<Scalar>(64, 64));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test entry point: runs the per-scalar driver for the four scalar types
// (real and complex, single and double precision), then the empty-matrix case.
EIGEN_DECLARE_TEST(gpu_cusolver_svd) {
  // Real scalars.
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());

  // Complex scalars.
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());

  // Degenerate 0x0 input.
  CALL_SUBTEST(test_svd_empty());
}
|
||||||
305
test/gpu_cusparse_spmv.cpp
Normal file
305
test/gpu_cusparse_spmv.cpp
Normal file
@@ -0,0 +1,305 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for GpuSparseContext: GPU SpMV/SpMM via cuSPARSE.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Helper: build a random sparse matrix -----------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
SparseMatrix<Scalar, ColMajor, int> make_sparse(Index rows, Index cols, double density = 0.1) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat R(rows, cols);
|
||||||
|
R.reserve(VectorXi::Constant(cols, static_cast<int>(rows * density) + 1));
|
||||||
|
for (Index j = 0; j < cols; ++j) {
|
||||||
|
for (Index i = 0; i < rows; ++i) {
|
||||||
|
if ((std::rand() / double(RAND_MAX)) < density) {
|
||||||
|
R.insert(i, j) = Scalar(RealScalar(std::rand() / double(RAND_MAX) - 0.5));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
R.makeCompressed();
|
||||||
|
return R;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMV: y = A * x -------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_spmv(Index rows, Index cols) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_sparse<Scalar>(rows, cols);
|
||||||
|
Vec x = Vec::Random(cols);
|
||||||
|
|
||||||
|
GpuSparseContext<Scalar> ctx;
|
||||||
|
Vec y_gpu = ctx.multiply(A, x);
|
||||||
|
Vec y_cpu = A * x;
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY_IS_EQUAL(y_gpu.size(), rows);
|
||||||
|
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMV with alpha/beta: y = alpha*A*x + beta*y ---------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_spmv_alpha_beta(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_sparse<Scalar>(n, n);
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
Vec y_init = Vec::Random(n);
|
||||||
|
|
||||||
|
Scalar alpha(2);
|
||||||
|
Scalar beta(3);
|
||||||
|
|
||||||
|
Vec y_cpu = alpha * (A * x) + beta * y_init;
|
||||||
|
|
||||||
|
GpuSparseContext<Scalar> ctx;
|
||||||
|
Vec y_gpu = y_init;
|
||||||
|
ctx.multiply(A, x, y_gpu, alpha, beta);
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Transpose: y = A^T * x ------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_spmv_transpose(Index rows, Index cols) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_sparse<Scalar>(rows, cols);
|
||||||
|
Vec x = Vec::Random(rows);
|
||||||
|
|
||||||
|
GpuSparseContext<Scalar> ctx;
|
||||||
|
Vec y_gpu = ctx.multiplyT(A, x);
|
||||||
|
Vec y_cpu = A.transpose() * x;
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY_IS_EQUAL(y_gpu.size(), cols);
|
||||||
|
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- SpMM: Y = A * X (multiple RHS) ----------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_spmm(Index rows, Index cols, Index nrhs) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_sparse<Scalar>(rows, cols);
|
||||||
|
Mat X = Mat::Random(cols, nrhs);
|
||||||
|
|
||||||
|
GpuSparseContext<Scalar> ctx;
|
||||||
|
Mat Y_gpu = ctx.multiplyMat(A, X);
|
||||||
|
Mat Y_cpu = A * X;
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY_IS_EQUAL(Y_gpu.rows(), rows);
|
||||||
|
VERIFY_IS_EQUAL(Y_gpu.cols(), nrhs);
|
||||||
|
VERIFY((Y_gpu - Y_cpu).norm() / (Y_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Identity matrix: I * x = x --------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_identity(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
// Build sparse identity.
|
||||||
|
SpMat eye(n, n);
|
||||||
|
eye.setIdentity();
|
||||||
|
eye.makeCompressed();
|
||||||
|
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuSparseContext<Scalar> ctx;
|
||||||
|
Vec y = ctx.multiply(eye, x);
|
||||||
|
|
||||||
|
RealScalar tol = NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((y - x).norm() < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Context reuse ----------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_reuse(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
GpuSparseContext<Scalar> ctx;
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
|
||||||
|
for (int trial = 0; trial < 3; ++trial) {
|
||||||
|
SpMat A = make_sparse<Scalar>(n, n);
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
Vec y_gpu = ctx.multiply(A, x);
|
||||||
|
Vec y_cpu = A * x;
|
||||||
|
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Empty ------------------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_empty() {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
|
||||||
|
SpMat A(0, 0);
|
||||||
|
A.makeCompressed();
|
||||||
|
Vec x(0);
|
||||||
|
|
||||||
|
GpuSparseContext<Scalar> ctx;
|
||||||
|
Vec y = ctx.multiply(A, x);
|
||||||
|
VERIFY_IS_EQUAL(y.size(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- DeviceMatrix SpMV (no host roundtrip) ----------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_spmv_device(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_sparse<Scalar>(n, n);
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
|
||||||
|
// Use shared GpuContext for same-stream execution.
|
||||||
|
GpuContext gpu_ctx;
|
||||||
|
GpuSparseContext<Scalar> ctx(gpu_ctx);
|
||||||
|
|
||||||
|
auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
|
||||||
|
DeviceMatrix<Scalar> d_y;
|
||||||
|
|
||||||
|
ctx.multiply(A, d_x, d_y);
|
||||||
|
|
||||||
|
Vec y_gpu = d_y.toHost(gpu_ctx.stream());
|
||||||
|
Vec y_cpu = A * x;
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Expression syntax: d_y = d_A * d_x ------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_spmv_expr(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A = make_sparse<Scalar>(n, n);
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuContext gpu_ctx;
|
||||||
|
GpuSparseContext<Scalar> ctx(gpu_ctx);
|
||||||
|
|
||||||
|
// Upload sparse matrix and create device view.
|
||||||
|
auto d_A = ctx.deviceView(A);
|
||||||
|
|
||||||
|
// Upload x.
|
||||||
|
auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
|
||||||
|
|
||||||
|
// Expression syntax: d_y = d_A * d_x
|
||||||
|
DeviceMatrix<Scalar> d_y;
|
||||||
|
d_y = d_A * d_x;
|
||||||
|
|
||||||
|
// Also test with noalias():
|
||||||
|
DeviceMatrix<Scalar> d_tmp;
|
||||||
|
d_tmp.noalias() = d_A * d_x;
|
||||||
|
|
||||||
|
Vec y_gpu = d_y.toHost(gpu_ctx.stream());
|
||||||
|
Vec tmp_gpu = d_tmp.toHost(gpu_ctx.stream());
|
||||||
|
Vec y_cpu = A * x;
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
VERIFY((tmp_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- deviceView overwrite: second view replaces first -----------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_deviceview_overwrite(Index n) {
|
||||||
|
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
SpMat A1 = make_sparse<Scalar>(n, n);
|
||||||
|
SpMat A2 = make_sparse<Scalar>(n, n); // different random matrix
|
||||||
|
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuContext gpu_ctx;
|
||||||
|
GpuSparseContext<Scalar> ctx(gpu_ctx);
|
||||||
|
|
||||||
|
// First view: A1.
|
||||||
|
auto d_A1 = ctx.deviceView(A1);
|
||||||
|
auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
|
||||||
|
DeviceMatrix<Scalar> d_y1;
|
||||||
|
d_y1 = d_A1 * d_x;
|
||||||
|
Vec y1_gpu = d_y1.toHost(gpu_ctx.stream());
|
||||||
|
Vec y1_cpu = A1 * x;
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((y1_gpu - y1_cpu).norm() / (y1_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
|
||||||
|
// Second view overwrites first: now uses A2.
|
||||||
|
auto d_A2 = ctx.deviceView(A2);
|
||||||
|
DeviceMatrix<Scalar> d_y2;
|
||||||
|
d_y2 = d_A2 * d_x;
|
||||||
|
Vec y2_gpu = d_y2.toHost(gpu_ctx.stream());
|
||||||
|
Vec y2_cpu = A2 * x;
|
||||||
|
VERIFY((y2_gpu - y2_cpu).norm() / (y2_cpu.norm() + RealScalar(1)) < tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
CALL_SUBTEST(test_spmv<Scalar>(64, 64));
|
||||||
|
CALL_SUBTEST(test_spmv<Scalar>(128, 64)); // non-square
|
||||||
|
CALL_SUBTEST(test_spmv<Scalar>(64, 128)); // wide
|
||||||
|
CALL_SUBTEST(test_spmv_alpha_beta<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_spmv_transpose<Scalar>(128, 64));
|
||||||
|
CALL_SUBTEST(test_spmm<Scalar>(64, 64, 4));
|
||||||
|
CALL_SUBTEST(test_identity<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_reuse<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_empty<Scalar>());
|
||||||
|
CALL_SUBTEST(test_spmv_device<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_spmv_expr<Scalar>(64));
|
||||||
|
CALL_SUBTEST(test_deviceview_overwrite<Scalar>(64));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test entry point: runs the per-scalar driver for the four scalar types
// (real and complex, single and double precision).
EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {
  // Real scalars.
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());

  // Complex scalars.
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
}
|
||||||
471
test/gpu_device_matrix.cpp
Normal file
471
test/gpu_device_matrix.cpp
Normal file
@@ -0,0 +1,471 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Tests for DeviceMatrix and HostTransfer: typed RAII GPU memory wrapper.
|
||||||
|
// No cuSOLVER dependency — only CUDA runtime.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include <Eigen/Sparse>
|
||||||
|
#include <Eigen/GPU>
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
|
||||||
|
// ---- Default construction ---------------------------------------------------
|
||||||
|
|
||||||
|
// A default-constructed DeviceMatrix must be empty: zero rows and columns,
// a null data pointer and a byte size of zero.
void test_default_construct() {
  DeviceMatrix<double> dm;

  // Shape.
  VERIFY(dm.empty());
  VERIFY_IS_EQUAL(dm.rows(), 0);
  VERIFY_IS_EQUAL(dm.cols(), 0);

  // Storage.
  VERIFY(dm.data() == nullptr);
  VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(0));
}
|
||||||
|
|
||||||
|
// ---- Allocate uninitialized -------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_allocate(Index rows, Index cols) {
|
||||||
|
DeviceMatrix<Scalar> dm(rows, cols);
|
||||||
|
VERIFY(!dm.empty());
|
||||||
|
VERIFY_IS_EQUAL(dm.rows(), rows);
|
||||||
|
VERIFY_IS_EQUAL(dm.cols(), cols);
|
||||||
|
VERIFY(dm.data() != nullptr);
|
||||||
|
VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- fromHost / toHost roundtrip (synchronous) ------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_roundtrip(Index rows, Index cols) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
MatrixType host = MatrixType::Random(rows, cols);
|
||||||
|
|
||||||
|
auto dm = DeviceMatrix<Scalar>::fromHost(host);
|
||||||
|
VERIFY_IS_EQUAL(dm.rows(), rows);
|
||||||
|
VERIFY_IS_EQUAL(dm.cols(), cols);
|
||||||
|
VERIFY(!dm.empty());
|
||||||
|
|
||||||
|
MatrixType result = dm.toHost();
|
||||||
|
VERIFY_IS_EQUAL(result.rows(), rows);
|
||||||
|
VERIFY_IS_EQUAL(result.cols(), cols);
|
||||||
|
VERIFY_IS_APPROX(result, host);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- fromHostAsync / toHostAsync roundtrip -----------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_roundtrip_async(Index rows, Index cols) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
MatrixType host = MatrixType::Random(rows, cols);
|
||||||
|
|
||||||
|
cudaStream_t stream;
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));
|
||||||
|
|
||||||
|
// Async upload from raw pointer.
|
||||||
|
auto dm = DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, stream);
|
||||||
|
VERIFY_IS_EQUAL(dm.rows(), rows);
|
||||||
|
VERIFY_IS_EQUAL(dm.cols(), cols);
|
||||||
|
|
||||||
|
// Async download via HostTransfer future.
|
||||||
|
auto transfer = dm.toHostAsync(stream);
|
||||||
|
|
||||||
|
// get() blocks and returns the matrix.
|
||||||
|
MatrixType result = transfer.get();
|
||||||
|
VERIFY_IS_APPROX(result, host);
|
||||||
|
|
||||||
|
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(stream));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- HostTransfer::ready() and idempotent get() -----------------------------
|
||||||
|
|
||||||
|
void test_host_transfer_ready() {
|
||||||
|
using MatrixType = Matrix<double, Dynamic, Dynamic>;
|
||||||
|
MatrixType host = MatrixType::Random(100, 100);
|
||||||
|
|
||||||
|
auto dm = DeviceMatrix<double>::fromHost(host);
|
||||||
|
auto transfer = dm.toHostAsync();
|
||||||
|
|
||||||
|
// After get(), ready() must return true.
|
||||||
|
MatrixType result = transfer.get();
|
||||||
|
VERIFY(transfer.ready());
|
||||||
|
VERIFY_IS_APPROX(result, host);
|
||||||
|
|
||||||
|
// get() is idempotent.
|
||||||
|
MatrixType& result2 = transfer.get();
|
||||||
|
VERIFY_IS_APPROX(result2, host);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- HostTransfer move ------------------------------------------------------
|
||||||
|
|
||||||
|
void test_host_transfer_move() {
|
||||||
|
using MatrixType = Matrix<double, Dynamic, Dynamic>;
|
||||||
|
MatrixType host = MatrixType::Random(50, 50);
|
||||||
|
|
||||||
|
auto dm = DeviceMatrix<double>::fromHost(host);
|
||||||
|
auto transfer = dm.toHostAsync();
|
||||||
|
|
||||||
|
HostTransfer<double> moved(std::move(transfer));
|
||||||
|
MatrixType result = moved.get();
|
||||||
|
VERIFY_IS_APPROX(result, host);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- clone() produces independent copy --------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_clone(Index rows, Index cols) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
MatrixType host = MatrixType::Random(rows, cols);
|
||||||
|
|
||||||
|
auto dm = DeviceMatrix<Scalar>::fromHost(host);
|
||||||
|
auto cloned = dm.clone();
|
||||||
|
|
||||||
|
// Overwrite original with different data.
|
||||||
|
MatrixType other = MatrixType::Random(rows, cols);
|
||||||
|
dm = DeviceMatrix<Scalar>::fromHost(other);
|
||||||
|
|
||||||
|
// Clone still holds the original data.
|
||||||
|
MatrixType clone_result = cloned.toHost();
|
||||||
|
VERIFY_IS_APPROX(clone_result, host);
|
||||||
|
|
||||||
|
// Original holds the new data.
|
||||||
|
MatrixType dm_result = dm.toHost();
|
||||||
|
VERIFY_IS_APPROX(dm_result, other);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Move construct ---------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_move_construct(Index rows, Index cols) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
MatrixType host = MatrixType::Random(rows, cols);
|
||||||
|
|
||||||
|
auto dm = DeviceMatrix<Scalar>::fromHost(host);
|
||||||
|
DeviceMatrix<Scalar> moved(std::move(dm));
|
||||||
|
|
||||||
|
VERIFY(dm.empty());
|
||||||
|
VERIFY(dm.data() == nullptr);
|
||||||
|
|
||||||
|
VERIFY_IS_EQUAL(moved.rows(), rows);
|
||||||
|
VERIFY_IS_EQUAL(moved.cols(), cols);
|
||||||
|
MatrixType result = moved.toHost();
|
||||||
|
VERIFY_IS_APPROX(result, host);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Move assign ------------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_move_assign(Index rows, Index cols) {
|
||||||
|
using MatrixType = Matrix<Scalar, Dynamic, Dynamic>;
|
||||||
|
MatrixType host = MatrixType::Random(rows, cols);
|
||||||
|
|
||||||
|
auto dm = DeviceMatrix<Scalar>::fromHost(host);
|
||||||
|
DeviceMatrix<Scalar> dest;
|
||||||
|
dest = std::move(dm);
|
||||||
|
|
||||||
|
VERIFY(dm.empty());
|
||||||
|
VERIFY_IS_EQUAL(dest.rows(), rows);
|
||||||
|
MatrixType result = dest.toHost();
|
||||||
|
VERIFY_IS_APPROX(result, host);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- resize() ---------------------------------------------------------------
|
||||||
|
|
||||||
|
// resize() must update the reported dimensions and keep a non-null data
// pointer; resizing to the current dimensions must leave the data pointer
// unchanged.
void test_resize() {
  // Initial allocation.
  DeviceMatrix<double> dm(10, 20);
  VERIFY_IS_EQUAL(dm.rows(), 10);
  VERIFY_IS_EQUAL(dm.cols(), 20);

  // Change to a different shape.
  dm.resize(50, 30);
  VERIFY_IS_EQUAL(dm.rows(), 50);
  VERIFY_IS_EQUAL(dm.cols(), 30);
  VERIFY(dm.data() != nullptr);

  // Same shape again: the pointer recorded beforehand must still be current.
  double* ptr_before = dm.data();
  dm.resize(50, 30);
  VERIFY(dm.data() == ptr_before);
}
|
||||||
|
|
||||||
|
// ---- Empty / 0x0 matrix -----------------------------------------------------
|
||||||
|
|
||||||
|
void test_empty() {
|
||||||
|
using MatrixType = Matrix<double, Dynamic, Dynamic>;
|
||||||
|
MatrixType empty_mat(0, 0);
|
||||||
|
|
||||||
|
auto dm = DeviceMatrix<double>::fromHost(empty_mat);
|
||||||
|
VERIFY(dm.empty());
|
||||||
|
VERIFY_IS_EQUAL(dm.rows(), 0);
|
||||||
|
VERIFY_IS_EQUAL(dm.cols(), 0);
|
||||||
|
|
||||||
|
MatrixType result = dm.toHost();
|
||||||
|
VERIFY_IS_EQUAL(result.rows(), 0);
|
||||||
|
VERIFY_IS_EQUAL(result.cols(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- Per-scalar driver ------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_scalar() {
|
||||||
|
// Square.
|
||||||
|
CALL_SUBTEST(test_roundtrip<Scalar>(1, 1));
|
||||||
|
CALL_SUBTEST(test_roundtrip<Scalar>(64, 64));
|
||||||
|
CALL_SUBTEST(test_roundtrip<Scalar>(256, 256));
|
||||||
|
|
||||||
|
// Rectangular.
|
||||||
|
CALL_SUBTEST(test_roundtrip<Scalar>(100, 7));
|
||||||
|
CALL_SUBTEST(test_roundtrip<Scalar>(7, 100));
|
||||||
|
|
||||||
|
// Async roundtrip.
|
||||||
|
CALL_SUBTEST(test_roundtrip_async<Scalar>(64, 64));
|
||||||
|
CALL_SUBTEST(test_roundtrip_async<Scalar>(100, 7));
|
||||||
|
|
||||||
|
CALL_SUBTEST(test_clone<Scalar>(64, 64));
|
||||||
|
CALL_SUBTEST(test_move_construct<Scalar>(64, 64));
|
||||||
|
CALL_SUBTEST(test_move_assign<Scalar>(64, 64));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- BLAS-1: dot, norm, squaredNorm, addScaled, scale, copyFrom, setZero ----
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_blas1(Index n) {
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
// All BLAS-1 ops share one GpuContext — same stream, zero event overhead.
|
||||||
|
GpuContext ctx;
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
|
||||||
|
// dot
|
||||||
|
{
|
||||||
|
Vec a = Vec::Random(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
|
||||||
|
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
|
||||||
|
Scalar gpu_dot = d_a.dot(ctx, d_b);
|
||||||
|
Scalar cpu_dot = a.dot(b);
|
||||||
|
VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// norm / squaredNorm
|
||||||
|
{
|
||||||
|
Vec a = Vec::Random(n);
|
||||||
|
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
|
||||||
|
RealScalar gpu_norm = d_a.norm(ctx);
|
||||||
|
RealScalar cpu_norm = a.norm();
|
||||||
|
VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol);
|
||||||
|
RealScalar gpu_sqnorm = d_a.squaredNorm(ctx);
|
||||||
|
RealScalar cpu_sqnorm = a.squaredNorm();
|
||||||
|
VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// addScaled (axpy)
|
||||||
|
{
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
Vec y = Vec::Random(n);
|
||||||
|
Scalar alpha(2.5);
|
||||||
|
Vec y_ref = y + alpha * x;
|
||||||
|
auto d_y = DeviceMatrix<Scalar>::fromHost(y, ctx.stream());
|
||||||
|
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
|
||||||
|
d_y.addScaled(ctx, alpha, d_x);
|
||||||
|
Vec y_gpu = d_y.toHost(ctx.stream());
|
||||||
|
VERIFY((y_gpu - y_ref).norm() < tol * y_ref.norm() + tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// scale (scal)
|
||||||
|
{
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
Scalar alpha(3.0);
|
||||||
|
Vec x_ref = alpha * x;
|
||||||
|
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
|
||||||
|
d_x.scale(ctx, alpha);
|
||||||
|
Vec x_gpu = d_x.toHost(ctx.stream());
|
||||||
|
VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// copyFrom
|
||||||
|
{
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
|
||||||
|
DeviceMatrix<Scalar> d_y;
|
||||||
|
d_y.copyFrom(ctx, d_x);
|
||||||
|
Vec y = d_y.toHost(ctx.stream());
|
||||||
|
VERIFY_IS_APPROX(y, x);
|
||||||
|
}
|
||||||
|
|
||||||
|
// setZero
|
||||||
|
{
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
|
||||||
|
d_x.setZero(ctx);
|
||||||
|
Vec result = d_x.toHost(ctx.stream());
|
||||||
|
VERIFY_IS_EQUAL(result, Vec::Zero(n));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- BLAS-1 operator overloads (CG-style) -----------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_cg_operators(Index n) {
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
|
||||||
|
Vec x = Vec::Random(n);
|
||||||
|
Vec p = Vec::Random(n);
|
||||||
|
Vec tmp = Vec::Random(n);
|
||||||
|
Vec z = Vec::Random(n);
|
||||||
|
Scalar alpha(2.5);
|
||||||
|
Scalar beta(0.7);
|
||||||
|
|
||||||
|
// Test: x += alpha * p
|
||||||
|
{
|
||||||
|
Vec x_ref = x + alpha * p;
|
||||||
|
auto d_x = DeviceMatrix<Scalar>::fromHost(x);
|
||||||
|
auto d_p = DeviceMatrix<Scalar>::fromHost(p);
|
||||||
|
d_x += alpha * d_p;
|
||||||
|
Vec x_gpu = d_x.toHost();
|
||||||
|
VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test: r -= alpha * tmp
|
||||||
|
{
|
||||||
|
Vec r = Vec::Random(n);
|
||||||
|
Vec r_ref = r - alpha * tmp;
|
||||||
|
auto d_r = DeviceMatrix<Scalar>::fromHost(r);
|
||||||
|
auto d_tmp = DeviceMatrix<Scalar>::fromHost(tmp);
|
||||||
|
d_r -= alpha * d_tmp;
|
||||||
|
Vec r_gpu = d_r.toHost();
|
||||||
|
VERIFY((r_gpu - r_ref).norm() < tol * r_ref.norm() + tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test: p = z + beta * p (cuBLAS geam)
|
||||||
|
{
|
||||||
|
Vec p_copy = p;
|
||||||
|
Vec p_ref = z + beta * p_copy;
|
||||||
|
auto d_p = DeviceMatrix<Scalar>::fromHost(p_copy);
|
||||||
|
auto d_z = DeviceMatrix<Scalar>::fromHost(z);
|
||||||
|
d_p = d_z + beta * d_p;
|
||||||
|
Vec p_gpu = d_p.toHost();
|
||||||
|
VERIFY((p_gpu - p_ref).norm() < tol * p_ref.norm() + tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test: operator+= and operator-= with DeviceMatrix (no scalar)
|
||||||
|
{
|
||||||
|
Vec a = Vec::Random(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
Vec a_ref = a + b;
|
||||||
|
auto d_a = DeviceMatrix<Scalar>::fromHost(a);
|
||||||
|
auto d_b = DeviceMatrix<Scalar>::fromHost(b);
|
||||||
|
d_a += d_b;
|
||||||
|
VERIFY((d_a.toHost() - a_ref).norm() < tol * a_ref.norm() + tol);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- DeviceScalar: deferred sync -------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_device_scalar() {
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
const Index n = 256;
|
||||||
|
Vec a = Vec::Random(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
|
||||||
|
GpuContext ctx;
|
||||||
|
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
|
||||||
|
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
|
||||||
|
|
||||||
|
// dot() returns DeviceScalar — implicit conversion to Scalar syncs.
|
||||||
|
Scalar gpu_dot = d_a.dot(ctx, d_b);
|
||||||
|
Scalar cpu_dot = a.dot(b);
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
|
||||||
|
|
||||||
|
// squaredNorm() returns host RealScalar directly (syncs internally).
|
||||||
|
RealScalar gpu_sqnorm = d_a.squaredNorm(ctx);
|
||||||
|
RealScalar cpu_sqnorm = a.squaredNorm();
|
||||||
|
VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol);
|
||||||
|
|
||||||
|
// norm() returns DeviceScalar<RealScalar> — implicit conversion syncs.
|
||||||
|
RealScalar gpu_norm = d_a.norm(ctx);
|
||||||
|
RealScalar cpu_norm = a.norm();
|
||||||
|
VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol);
|
||||||
|
|
||||||
|
// Convenience overloads (thread-local context).
|
||||||
|
GpuContext::setThreadLocal(&ctx);
|
||||||
|
Scalar gpu_dot2 = d_a.dot(d_b);
|
||||||
|
VERIFY(numext::abs(gpu_dot2 - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
|
||||||
|
GpuContext::setThreadLocal(nullptr);
|
||||||
|
|
||||||
|
// Empty vectors: dot and norm must return zero.
|
||||||
|
{
|
||||||
|
DeviceMatrix<Scalar> d_empty(0, 1);
|
||||||
|
DeviceMatrix<Scalar> d_empty2(0, 1);
|
||||||
|
Scalar empty_dot = d_empty.dot(ctx, d_empty2);
|
||||||
|
VERIFY_IS_EQUAL(empty_dot, Scalar(0));
|
||||||
|
RealScalar empty_sqnorm = d_empty.squaredNorm(ctx);
|
||||||
|
VERIFY_IS_EQUAL(empty_sqnorm, RealScalar(0));
|
||||||
|
RealScalar empty_norm = d_empty.norm(ctx);
|
||||||
|
VERIFY_IS_EQUAL(empty_norm, RealScalar(0));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---- cwiseProduct -----------------------------------------------------------
|
||||||
|
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_cwiseProduct() {
|
||||||
|
using Vec = Matrix<Scalar, Dynamic, 1>;
|
||||||
|
using RealScalar = typename NumTraits<Scalar>::Real;
|
||||||
|
|
||||||
|
const Index n = 256;
|
||||||
|
Vec a = Vec::Random(n);
|
||||||
|
Vec b = Vec::Random(n);
|
||||||
|
Vec ref = a.array() * b.array();
|
||||||
|
|
||||||
|
GpuContext ctx;
|
||||||
|
auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
|
||||||
|
auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
|
||||||
|
auto d_c = d_a.cwiseProduct(ctx, d_b);
|
||||||
|
Vec result = d_c.toHost(ctx.stream());
|
||||||
|
|
||||||
|
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
|
||||||
|
VERIFY((result - ref).norm() < tol * ref.norm() + tol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test entry point: runs each DeviceMatrix subtest in turn. Templated
// subtests are instantiated once per scalar type listed.
EIGEN_DECLARE_TEST(gpu_device_matrix) {
  // Subtests that take no scalar type.
  CALL_SUBTEST(test_default_construct());
  CALL_SUBTEST(test_empty());
  CALL_SUBTEST(test_resize());
  CALL_SUBTEST(test_host_transfer_ready());
  CALL_SUBTEST(test_host_transfer_move());

  // Allocation with an explicit 100 x 50 shape (real types only).
  CALL_SUBTEST((test_allocate<float>(100, 50)));
  CALL_SUBTEST((test_allocate<double>(100, 50)));

  // Per-scalar-type subtests: real and complex, single and double precision.
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());

  // BLAS-1 methods with an explicit context, vectors of length 256.
  CALL_SUBTEST(test_blas1<float>(256));
  CALL_SUBTEST(test_blas1<double>(256));
  CALL_SUBTEST(test_blas1<std::complex<float>>(256));
  CALL_SUBTEST(test_blas1<std::complex<double>>(256));

  // Operator-overload forms of the same updates, vectors of length 256.
  CALL_SUBTEST(test_cg_operators<float>(256));
  CALL_SUBTEST(test_cg_operators<double>(256));
  CALL_SUBTEST(test_cg_operators<std::complex<float>>(256));
  CALL_SUBTEST(test_cg_operators<std::complex<double>>(256));

  // Scalar-returning reductions.
  CALL_SUBTEST(test_device_scalar<float>());
  CALL_SUBTEST(test_device_scalar<double>());
  CALL_SUBTEST(test_device_scalar<std::complex<float>>());
  CALL_SUBTEST(test_device_scalar<std::complex<double>>());

  // Coefficient-wise product (only instantiated for real types here).
  CALL_SUBTEST(test_cwiseProduct<float>());
  CALL_SUBTEST(test_cwiseProduct<double>());
}
|
||||||
110
test/gpu_library_example.cu
Normal file
110
test/gpu_library_example.cu
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
// Smoke test for GPU library test infrastructure.
|
||||||
|
// Verifies GpuContext, GpuBuffer, and host<->device matrix transfers
|
||||||
|
// without calling any NVIDIA library routine (cuBLAS, cuSOLVER, etc.) directly;
// the cuSOLVER handle owned by GpuContext is only checked for being non-null.
|
||||||
|
|
||||||
|
#define EIGEN_USE_GPU
|
||||||
|
#include "main.h"
|
||||||
|
#include "gpu_context.h"
|
||||||
|
#include "gpu_library_test_helper.h"
|
||||||
|
|
||||||
|
using namespace Eigen;
|
||||||
|
using namespace Eigen::test;
|
||||||
|
|
||||||
|
// Test that GpuContext initializes, reports valid device info, and owns a cuSOLVER handle.
|
||||||
|
void test_gpu_context() {
|
||||||
|
GpuContext ctx;
|
||||||
|
VERIFY(ctx.device() >= 0);
|
||||||
|
VERIFY(ctx.deviceProperties().major >= 7); // sm_70 minimum
|
||||||
|
VERIFY(ctx.stream != nullptr);
|
||||||
|
VERIFY(ctx.cusolver != nullptr);
|
||||||
|
std::cout << " GPU: " << ctx.deviceProperties().name << " (sm_" << ctx.deviceProperties().major
|
||||||
|
<< ctx.deviceProperties().minor << ")\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test dense matrix roundtrip: host -> device -> host.
|
||||||
|
template <typename MatrixType>
|
||||||
|
void test_dense_roundtrip() {
|
||||||
|
GpuContext ctx;
|
||||||
|
const Index rows = 64;
|
||||||
|
const Index cols = 32;
|
||||||
|
|
||||||
|
MatrixType A = MatrixType::Random(rows, cols);
|
||||||
|
auto buf = gpu_copy_to_device(ctx.stream, A);
|
||||||
|
VERIFY(buf.data != nullptr);
|
||||||
|
VERIFY(buf.size == rows * cols);
|
||||||
|
|
||||||
|
MatrixType B(rows, cols);
|
||||||
|
B.setZero();
|
||||||
|
gpu_copy_to_host(ctx.stream, buf, B);
|
||||||
|
ctx.synchronize();
|
||||||
|
|
||||||
|
VERIFY_IS_EQUAL(A, B);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test GpuBuffer RAII: move semantics, async zero-init.
|
||||||
|
void test_gpu_buffer() {
|
||||||
|
GpuContext ctx;
|
||||||
|
|
||||||
|
GpuBuffer<float> a(128);
|
||||||
|
VERIFY(a.data != nullptr);
|
||||||
|
VERIFY(a.size == 128);
|
||||||
|
|
||||||
|
// Move construction.
|
||||||
|
GpuBuffer<float> b(std::move(a));
|
||||||
|
VERIFY(a.data == nullptr);
|
||||||
|
VERIFY(b.data != nullptr);
|
||||||
|
VERIFY(b.size == 128);
|
||||||
|
|
||||||
|
// Move assignment.
|
||||||
|
GpuBuffer<float> c;
|
||||||
|
c = std::move(b);
|
||||||
|
VERIFY(b.data == nullptr);
|
||||||
|
VERIFY(c.data != nullptr);
|
||||||
|
|
||||||
|
// setZeroAsync.
|
||||||
|
c.setZeroAsync(ctx.stream);
|
||||||
|
ctx.synchronize();
|
||||||
|
|
||||||
|
std::vector<float> host(128);
|
||||||
|
GPU_CHECK(cudaMemcpy(host.data(), c.data, 128 * sizeof(float), cudaMemcpyDeviceToHost));
|
||||||
|
for (int i = 0; i < 128; ++i) {
|
||||||
|
VERIFY_IS_EQUAL(host[i], 0.0f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test with vectors (1D).
|
||||||
|
template <typename Scalar>
|
||||||
|
void test_vector_roundtrip() {
|
||||||
|
GpuContext ctx;
|
||||||
|
const Index n = 256;
|
||||||
|
Matrix<Scalar, Dynamic, 1> v = Matrix<Scalar, Dynamic, 1>::Random(n);
|
||||||
|
auto buf = gpu_copy_to_device(ctx.stream, v);
|
||||||
|
|
||||||
|
Matrix<Scalar, Dynamic, 1> w(n);
|
||||||
|
w.setZero();
|
||||||
|
gpu_copy_to_host(ctx.stream, buf, w);
|
||||||
|
ctx.synchronize();
|
||||||
|
|
||||||
|
VERIFY_IS_EQUAL(v, w);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test entry point for the GPU library helper smoke tests.
EIGEN_DECLARE_TEST(gpu_library_example) {
  // Context and buffer checks.
  CALL_SUBTEST(test_gpu_context());
  CALL_SUBTEST(test_gpu_buffer());

  // Dense matrix round trips: default storage order, then row-major.
  CALL_SUBTEST(test_dense_roundtrip<MatrixXf>());
  CALL_SUBTEST(test_dense_roundtrip<MatrixXd>());
  CALL_SUBTEST((test_dense_roundtrip<Matrix<float, Dynamic, Dynamic, RowMajor>>()));
  CALL_SUBTEST((test_dense_roundtrip<Matrix<double, Dynamic, Dynamic, RowMajor>>()));

  // Vector round trips for real and complex scalars.
  CALL_SUBTEST(test_vector_roundtrip<float>());
  CALL_SUBTEST(test_vector_roundtrip<double>());
  CALL_SUBTEST(test_vector_roundtrip<std::complex<float>>());
}
|
||||||
90
test/gpu_library_test_helper.h
Normal file
90
test/gpu_library_test_helper.h
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
// This file is part of Eigen, a lightweight C++ template library
|
||||||
|
// for linear algebra.
|
||||||
|
//
|
||||||
|
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
|
||||||
|
//
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla
|
||||||
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||||
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
#ifndef EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H
|
||||||
|
#define EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H
|
||||||
|
|
||||||
|
// Helpers for GPU tests that call NVIDIA library APIs (cuBLAS, cuSOLVER, etc.)
|
||||||
|
// from the host side. Provides RAII GPU memory management and async matrix transfer.
|
||||||
|
//
|
||||||
|
// This is separate from gpu_common.h (element-parallel device kernels) and
|
||||||
|
// gpu_test_helper.h (serialization-based device kernels). Those patterns run
|
||||||
|
// user functors inside GPU kernels. This helper is for host-orchestrated tests
|
||||||
|
// that call library APIs which launch their own kernels internally.
|
||||||
|
//
|
||||||
|
// All transfers use an explicit stream and cudaMemcpyAsync. Callers must
|
||||||
|
// synchronize (ctx.synchronize() or cudaStreamSynchronize) before reading
|
||||||
|
// results back on the host.
|
||||||
|
|
||||||
|
#include "gpu_test_helper.h"
|
||||||
|
|
||||||
|
namespace Eigen {
|
||||||
|
namespace test {
|
||||||
|
|
||||||
|
// RAII wrapper for GPU device memory. Prevents leaks when VERIFY macros abort.
|
||||||
|
template <typename Scalar>
|
||||||
|
struct GpuBuffer {
|
||||||
|
Scalar* data = nullptr;
|
||||||
|
Index size = 0;
|
||||||
|
|
||||||
|
GpuBuffer() = default;
|
||||||
|
|
||||||
|
explicit GpuBuffer(Index n) : size(n) { GPU_CHECK(gpuMalloc(reinterpret_cast<void**>(&data), n * sizeof(Scalar))); }
|
||||||
|
|
||||||
|
~GpuBuffer() {
|
||||||
|
if (data) GPU_CHECK(gpuFree(data));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Move-only.
|
||||||
|
GpuBuffer(GpuBuffer&& other) noexcept : data(other.data), size(other.size) {
|
||||||
|
other.data = nullptr;
|
||||||
|
other.size = 0;
|
||||||
|
}
|
||||||
|
GpuBuffer& operator=(GpuBuffer&& other) noexcept {
|
||||||
|
if (this != &other) {
|
||||||
|
if (data) GPU_CHECK(gpuFree(data));
|
||||||
|
data = other.data;
|
||||||
|
size = other.size;
|
||||||
|
other.data = nullptr;
|
||||||
|
other.size = 0;
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
GpuBuffer(const GpuBuffer&) = delete;
|
||||||
|
GpuBuffer& operator=(const GpuBuffer&) = delete;
|
||||||
|
|
||||||
|
// Async zero the buffer on the given stream.
|
||||||
|
void setZeroAsync(cudaStream_t stream) { GPU_CHECK(cudaMemsetAsync(data, 0, size * sizeof(Scalar), stream)); }
|
||||||
|
};
|
||||||
|
|
||||||
|
// Copy a dense Eigen matrix to a new GPU buffer, async on the given stream.
|
||||||
|
// Caller must synchronize before the host matrix is freed or modified.
|
||||||
|
template <typename Derived>
|
||||||
|
GpuBuffer<typename Derived::Scalar> gpu_copy_to_device(cudaStream_t stream, const MatrixBase<Derived>& host_mat) {
|
||||||
|
using Scalar = typename Derived::Scalar;
|
||||||
|
const auto& mat = host_mat.derived();
|
||||||
|
GpuBuffer<Scalar> buf(mat.size());
|
||||||
|
GPU_CHECK(cudaMemcpyAsync(buf.data, mat.data(), mat.size() * sizeof(Scalar), cudaMemcpyHostToDevice, stream));
|
||||||
|
return buf;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copy GPU buffer contents back to a dense Eigen matrix, async on the given stream.
|
||||||
|
// Caller must synchronize before reading from host_mat.
|
||||||
|
template <typename Scalar, typename Derived>
|
||||||
|
void gpu_copy_to_host(cudaStream_t stream, const GpuBuffer<Scalar>& buf, MatrixBase<Derived>& host_mat) {
|
||||||
|
auto& mat = host_mat.derived();
|
||||||
|
eigen_assert(buf.size == mat.size());
|
||||||
|
GPU_CHECK(cudaMemcpyAsync(mat.data(), buf.data, mat.size() * sizeof(Scalar), cudaMemcpyDeviceToHost, stream));
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace test
|
||||||
|
} // namespace Eigen
|
||||||
|
|
||||||
|
#endif // EIGEN_TEST_GPU_LIBRARY_TEST_HELPER_H
|
||||||
Reference in New Issue
Block a user