mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
297 lines
9.7 KiB
C++
297 lines
9.7 KiB
C++
|
|
// GPU solver benchmarks: GpuLLT and GpuLU compute + solve throughput.
|
||
|
|
//
|
||
|
|
// Measures factorization and solve performance for the host-matrix and
|
||
|
|
// DeviceMatrix code paths across a range of matrix sizes.
|
||
|
|
//
|
||
|
|
// For Nsight Systems profiling:
|
||
|
|
// nsys profile --trace=cuda,nvtx ./bench_gpu_solvers
|
||
|
|
//
|
||
|
|
// For Nsight Compute kernel analysis:
|
||
|
|
// ncu --set full -o profile ./bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096
|
||
|
|
|
||
|
|
#include <benchmark/benchmark.h>
|
||
|
|
|
||
|
|
#include <Eigen/Cholesky>
|
||
|
|
#include <Eigen/GPU>
|
||
|
|
#include <Eigen/LU>
|
||
|
|
|
||
|
|
using namespace Eigen;
|
||
|
|
|
||
|
|
#ifndef SCALAR
|
||
|
|
#define SCALAR double
|
||
|
|
#endif
|
||
|
|
|
||
|
|
using Scalar = SCALAR;
|
||
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||
|
|
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
// Helpers
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
|
||
|
|
static Mat make_spd(Index n) {
|
||
|
|
Mat M = Mat::Random(n, n);
|
||
|
|
return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
|
||
|
|
}
|
||
|
|
|
||
|
|
// CUDA warm-up: ensure the GPU is initialized before timing.
|
||
|
|
static void cuda_warmup() {
|
||
|
|
static bool done = false;
|
||
|
|
if (!done) {
|
||
|
|
void* p;
|
||
|
|
cudaMalloc(&p, 1);
|
||
|
|
cudaFree(p);
|
||
|
|
done = true;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
// GpuLLT benchmarks
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
|
||
|
|
// Factorize from host matrix (includes H2D upload).
|
||
|
|
static void BM_GpuLLT_Compute_Host(benchmark::State& state) {
|
||
|
|
cuda_warmup();
|
||
|
|
const Index n = state.range(0);
|
||
|
|
Mat A = make_spd(n);
|
||
|
|
GpuLLT<Scalar> llt;
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
llt.compute(A);
|
||
|
|
if (llt.info() != Success) state.SkipWithError("factorization failed");
|
||
|
|
}
|
||
|
|
|
||
|
|
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
|
||
|
|
state.counters["GFLOPS"] =
|
||
|
|
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||
|
|
state.counters["n"] = n;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Factorize from DeviceMatrix (D2D copy path).
|
||
|
|
static void BM_GpuLLT_Compute_Device(benchmark::State& state) {
|
||
|
|
cuda_warmup();
|
||
|
|
const Index n = state.range(0);
|
||
|
|
Mat A = make_spd(n);
|
||
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||
|
|
GpuLLT<Scalar> llt;
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
llt.compute(d_A);
|
||
|
|
if (llt.info() != Success) state.SkipWithError("factorization failed");
|
||
|
|
}
|
||
|
|
|
||
|
|
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
|
||
|
|
state.counters["GFLOPS"] =
|
||
|
|
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||
|
|
state.counters["n"] = n;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Factorize from DeviceMatrix (move path, no copy).
|
||
|
|
static void BM_GpuLLT_Compute_DeviceMove(benchmark::State& state) {
|
||
|
|
cuda_warmup();
|
||
|
|
const Index n = state.range(0);
|
||
|
|
Mat A = make_spd(n);
|
||
|
|
GpuLLT<Scalar> llt;
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||
|
|
llt.compute(std::move(d_A));
|
||
|
|
if (llt.info() != Success) state.SkipWithError("factorization failed");
|
||
|
|
}
|
||
|
|
|
||
|
|
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
|
||
|
|
state.counters["GFLOPS"] =
|
||
|
|
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||
|
|
state.counters["n"] = n;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Solve from host matrix (H2D + potrs + D2H).
|
||
|
|
static void BM_GpuLLT_Solve_Host(benchmark::State& state) {
|
||
|
|
cuda_warmup();
|
||
|
|
const Index n = state.range(0);
|
||
|
|
const Index nrhs = state.range(1);
|
||
|
|
Mat A = make_spd(n);
|
||
|
|
Mat B = Mat::Random(n, nrhs);
|
||
|
|
GpuLLT<Scalar> llt(A);
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
Mat X = llt.solve(B);
|
||
|
|
benchmark::DoNotOptimize(X.data());
|
||
|
|
}
|
||
|
|
|
||
|
|
state.counters["n"] = n;
|
||
|
|
state.counters["nrhs"] = nrhs;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Solve from DeviceMatrix (D2D + potrs, async, toHost at end).
|
||
|
|
static void BM_GpuLLT_Solve_Device(benchmark::State& state) {
|
||
|
|
cuda_warmup();
|
||
|
|
const Index n = state.range(0);
|
||
|
|
const Index nrhs = state.range(1);
|
||
|
|
Mat A = make_spd(n);
|
||
|
|
Mat B = Mat::Random(n, nrhs);
|
||
|
|
GpuLLT<Scalar> llt(A);
|
||
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
|
||
|
|
Mat X = d_X.toHost();
|
||
|
|
benchmark::DoNotOptimize(X.data());
|
||
|
|
}
|
||
|
|
|
||
|
|
state.counters["n"] = n;
|
||
|
|
state.counters["nrhs"] = nrhs;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Solve staying entirely on device (no toHost — measures pure GPU time).
|
||
|
|
static void BM_GpuLLT_Solve_DeviceOnly(benchmark::State& state) {
|
||
|
|
cuda_warmup();
|
||
|
|
const Index n = state.range(0);
|
||
|
|
const Index nrhs = state.range(1);
|
||
|
|
Mat A = make_spd(n);
|
||
|
|
Mat B = Mat::Random(n, nrhs);
|
||
|
|
GpuLLT<Scalar> llt(A);
|
||
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
DeviceMatrix<Scalar> d_X = llt.solve(d_B);
|
||
|
|
// Force completion without D2H transfer.
|
||
|
|
cudaStreamSynchronize(llt.stream());
|
||
|
|
benchmark::DoNotOptimize(d_X.data());
|
||
|
|
}
|
||
|
|
|
||
|
|
state.counters["n"] = n;
|
||
|
|
state.counters["nrhs"] = nrhs;
|
||
|
|
}
|
||
|
|
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
// GpuLU benchmarks
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
|
||
|
|
static void BM_GpuLU_Compute_Host(benchmark::State& state) {
|
||
|
|
cuda_warmup();
|
||
|
|
const Index n = state.range(0);
|
||
|
|
Mat A = Mat::Random(n, n);
|
||
|
|
GpuLU<Scalar> lu;
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
lu.compute(A);
|
||
|
|
if (lu.info() != Success) state.SkipWithError("factorization failed");
|
||
|
|
}
|
||
|
|
|
||
|
|
double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
|
||
|
|
state.counters["GFLOPS"] =
|
||
|
|
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||
|
|
state.counters["n"] = n;
|
||
|
|
}
|
||
|
|
|
||
|
|
static void BM_GpuLU_Compute_Device(benchmark::State& state) {
|
||
|
|
cuda_warmup();
|
||
|
|
const Index n = state.range(0);
|
||
|
|
Mat A = Mat::Random(n, n);
|
||
|
|
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
|
||
|
|
GpuLU<Scalar> lu;
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
lu.compute(d_A);
|
||
|
|
if (lu.info() != Success) state.SkipWithError("factorization failed");
|
||
|
|
}
|
||
|
|
|
||
|
|
double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
|
||
|
|
state.counters["GFLOPS"] =
|
||
|
|
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||
|
|
state.counters["n"] = n;
|
||
|
|
}
|
||
|
|
|
||
|
|
static void BM_GpuLU_Solve_Host(benchmark::State& state) {
|
||
|
|
cuda_warmup();
|
||
|
|
const Index n = state.range(0);
|
||
|
|
const Index nrhs = state.range(1);
|
||
|
|
Mat A = Mat::Random(n, n);
|
||
|
|
Mat B = Mat::Random(n, nrhs);
|
||
|
|
GpuLU<Scalar> lu(A);
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
Mat X = lu.solve(B);
|
||
|
|
benchmark::DoNotOptimize(X.data());
|
||
|
|
}
|
||
|
|
|
||
|
|
state.counters["n"] = n;
|
||
|
|
state.counters["nrhs"] = nrhs;
|
||
|
|
}
|
||
|
|
|
||
|
|
static void BM_GpuLU_Solve_Device(benchmark::State& state) {
|
||
|
|
cuda_warmup();
|
||
|
|
const Index n = state.range(0);
|
||
|
|
const Index nrhs = state.range(1);
|
||
|
|
Mat A = Mat::Random(n, n);
|
||
|
|
Mat B = Mat::Random(n, nrhs);
|
||
|
|
GpuLU<Scalar> lu(A);
|
||
|
|
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
DeviceMatrix<Scalar> d_X = lu.solve(d_B);
|
||
|
|
Mat X = d_X.toHost();
|
||
|
|
benchmark::DoNotOptimize(X.data());
|
||
|
|
}
|
||
|
|
|
||
|
|
state.counters["n"] = n;
|
||
|
|
state.counters["nrhs"] = nrhs;
|
||
|
|
}
|
||
|
|
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
// CPU baselines for comparison
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
|
||
|
|
static void BM_CpuLLT_Compute(benchmark::State& state) {
|
||
|
|
const Index n = state.range(0);
|
||
|
|
Mat A = make_spd(n);
|
||
|
|
LLT<Mat> llt;
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
llt.compute(A);
|
||
|
|
benchmark::DoNotOptimize(llt.matrixLLT().data());
|
||
|
|
}
|
||
|
|
|
||
|
|
double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
|
||
|
|
state.counters["GFLOPS"] =
|
||
|
|
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||
|
|
state.counters["n"] = n;
|
||
|
|
}
|
||
|
|
|
||
|
|
static void BM_CpuLU_Compute(benchmark::State& state) {
|
||
|
|
const Index n = state.range(0);
|
||
|
|
Mat A = Mat::Random(n, n);
|
||
|
|
PartialPivLU<Mat> lu;
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
lu.compute(A);
|
||
|
|
benchmark::DoNotOptimize(lu.matrixLU().data());
|
||
|
|
}
|
||
|
|
|
||
|
|
double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
|
||
|
|
state.counters["GFLOPS"] =
|
||
|
|
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
|
||
|
|
state.counters["n"] = n;
|
||
|
|
}
|
||
|
|
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
// Registration
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
|
||
|
|
// clang-format off
|
||
|
|
BENCHMARK(BM_GpuLLT_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
|
||
|
|
BENCHMARK(BM_GpuLLT_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
|
||
|
|
BENCHMARK(BM_GpuLLT_Compute_DeviceMove)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
|
||
|
|
BENCHMARK(BM_GpuLLT_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
|
||
|
|
BENCHMARK(BM_GpuLLT_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
|
||
|
|
BENCHMARK(BM_GpuLLT_Solve_DeviceOnly)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
|
||
|
|
|
||
|
|
BENCHMARK(BM_GpuLU_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
|
||
|
|
BENCHMARK(BM_GpuLU_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
|
||
|
|
BENCHMARK(BM_GpuLU_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
|
||
|
|
BENCHMARK(BM_GpuLU_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
|
||
|
|
|
||
|
|
BENCHMARK(BM_CpuLLT_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
|
||
|
|
BENCHMARK(BM_CpuLU_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
|
||
|
|
// clang-format on
|