Files
eigen/benchmarks/GPU/bench_gpu_solvers.cpp
Rasmus Munk Larsen 58c44ef36d GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.

Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
  async host transfer (fromHost/toHost) and CUDA event-based cross-stream
  synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
  handle + cuSOLVER handle. Thread-local default with explicit override
  via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.

cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)

cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.

CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.

Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 19:05:25 -07:00

297 lines
9.7 KiB
C++

// GPU solver benchmarks: GpuLLT and GpuLU compute + solve throughput.
//
// Measures factorization and solve performance for the host-matrix and
// DeviceMatrix code paths across a range of matrix sizes.
//
// For Nsight Systems profiling:
// nsys profile --trace=cuda,nvtx ./bench_gpu_solvers
//
// For Nsight Compute kernel analysis:
// ncu --set full -o profile ./bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096
#include <benchmark/benchmark.h>
#include <Eigen/Cholesky>
#include <Eigen/GPU>
#include <Eigen/LU>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR double
#endif
using Scalar = SCALAR;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
// --------------------------------------------------------------------------
// Helpers
// --------------------------------------------------------------------------
static Mat make_spd(Index n) {
Mat M = Mat::Random(n, n);
return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
}
// CUDA warm-up: ensure the GPU is initialized before timing.
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
// --------------------------------------------------------------------------
// GpuLLT benchmarks
// --------------------------------------------------------------------------
// Factorize from host matrix (includes H2D upload).
static void BM_GpuLLT_Compute_Host(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
Mat A = make_spd(n);
GpuLLT<Scalar> llt;
for (auto _ : state) {
llt.compute(A);
if (llt.info() != Success) state.SkipWithError("factorization failed");
}
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
// Factorize from DeviceMatrix (D2D copy path).
static void BM_GpuLLT_Compute_Device(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
Mat A = make_spd(n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
GpuLLT<Scalar> llt;
for (auto _ : state) {
llt.compute(d_A);
if (llt.info() != Success) state.SkipWithError("factorization failed");
}
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
// Factorize from DeviceMatrix (move path, no copy).
static void BM_GpuLLT_Compute_DeviceMove(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
Mat A = make_spd(n);
GpuLLT<Scalar> llt;
for (auto _ : state) {
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
llt.compute(std::move(d_A));
if (llt.info() != Success) state.SkipWithError("factorization failed");
}
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
// Solve from host matrix (H2D + potrs + D2H).
static void BM_GpuLLT_Solve_Host(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const Index nrhs = state.range(1);
Mat A = make_spd(n);
Mat B = Mat::Random(n, nrhs);
GpuLLT<Scalar> llt(A);
for (auto _ : state) {
Mat X = llt.solve(B);
benchmark::DoNotOptimize(X.data());
}
state.counters["n"] = n;
state.counters["nrhs"] = nrhs;
}
// Solve from DeviceMatrix (D2D + potrs, async, toHost at end).
static void BM_GpuLLT_Solve_Device(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const Index nrhs = state.range(1);
Mat A = make_spd(n);
Mat B = Mat::Random(n, nrhs);
GpuLLT<Scalar> llt(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
for (auto _ : state) {
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
Mat X = d_X.toHost();
benchmark::DoNotOptimize(X.data());
}
state.counters["n"] = n;
state.counters["nrhs"] = nrhs;
}
// Solve staying entirely on device (no toHost — measures pure GPU time).
static void BM_GpuLLT_Solve_DeviceOnly(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const Index nrhs = state.range(1);
Mat A = make_spd(n);
Mat B = Mat::Random(n, nrhs);
GpuLLT<Scalar> llt(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
for (auto _ : state) {
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
// Force completion without D2H transfer.
cudaStreamSynchronize(llt.stream());
benchmark::DoNotOptimize(d_X.data());
}
state.counters["n"] = n;
state.counters["nrhs"] = nrhs;
}
// --------------------------------------------------------------------------
// GpuLU benchmarks
// --------------------------------------------------------------------------
static void BM_GpuLU_Compute_Host(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
Mat A = Mat::Random(n, n);
GpuLU<Scalar> lu;
for (auto _ : state) {
lu.compute(A);
if (lu.info() != Success) state.SkipWithError("factorization failed");
}
double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
static void BM_GpuLU_Compute_Device(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
Mat A = Mat::Random(n, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
GpuLU<Scalar> lu;
for (auto _ : state) {
lu.compute(d_A);
if (lu.info() != Success) state.SkipWithError("factorization failed");
}
double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
static void BM_GpuLU_Solve_Host(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const Index nrhs = state.range(1);
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
GpuLU<Scalar> lu(A);
for (auto _ : state) {
Mat X = lu.solve(B);
benchmark::DoNotOptimize(X.data());
}
state.counters["n"] = n;
state.counters["nrhs"] = nrhs;
}
static void BM_GpuLU_Solve_Device(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
const Index nrhs = state.range(1);
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
GpuLU<Scalar> lu(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
for (auto _ : state) {
DeviceMatrix<Scalar> d_X = lu.solve(d_B);
Mat X = d_X.toHost();
benchmark::DoNotOptimize(X.data());
}
state.counters["n"] = n;
state.counters["nrhs"] = nrhs;
}
// --------------------------------------------------------------------------
// CPU baselines for comparison
// --------------------------------------------------------------------------
static void BM_CpuLLT_Compute(benchmark::State& state) {
const Index n = state.range(0);
Mat A = make_spd(n);
LLT<Mat> llt;
for (auto _ : state) {
llt.compute(A);
benchmark::DoNotOptimize(llt.matrixLLT().data());
}
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
static void BM_CpuLU_Compute(benchmark::State& state) {
const Index n = state.range(0);
Mat A = Mat::Random(n, n);
PartialPivLU<Mat> lu;
for (auto _ : state) {
lu.compute(A);
benchmark::DoNotOptimize(lu.matrixLU().data());
}
double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
// --------------------------------------------------------------------------
// Registration
// --------------------------------------------------------------------------
// clang-format off
BENCHMARK(BM_GpuLLT_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Compute_DeviceMove)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_DeviceOnly)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_CpuLLT_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_CpuLU_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
// clang-format on