mirror of
https://gitlab.com/libeigen/eigen.git
synced 2026-04-10 11:34:33 +08:00
269 lines
8.8 KiB
C++
269 lines
8.8 KiB
C++
|
|
// GPU batching benchmarks: multi-stream concurrency for many small solves.
|
||
|
|
//
|
||
|
|
// Each GpuLLT/GpuLU owns its own CUDA stream. This benchmark measures how
|
||
|
|
// well multiple solver instances overlap on the GPU, which is critical for
|
||
|
|
// workloads like robotics (many small systems) and SLAM (batched poses).
|
||
|
|
//
|
||
|
|
// Compares:
|
||
|
|
// 1. Sequential: one solver handles all systems one by one
|
||
|
|
// 2. Batched: N solvers on N streams, all launched before any sync
|
||
|
|
// 3. CPU baseline: Eigen LLT on host
|
||
|
|
//
|
||
|
|
// For Nsight Systems: batched mode should show overlapping kernels on
|
||
|
|
// different streams in the timeline view.
|
||
|
|
//
|
||
|
|
// nsys profile --trace=cuda ./bench_gpu_batching
|
||
|
|
|
||
|
|
#include <benchmark/benchmark.h>
|
||
|
|
|
||
|
|
#include <Eigen/Cholesky>
|
||
|
|
#include <Eigen/GPU>
|
||
|
|
|
||
|
|
#include <memory>
|
||
|
|
#include <vector>
|
||
|
|
|
||
|
|
using namespace Eigen;
|
||
|
|
|
||
|
|
#ifndef SCALAR
|
||
|
|
#define SCALAR double
|
||
|
|
#endif
|
||
|
|
|
||
|
|
using Scalar = SCALAR;
|
||
|
|
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
|
||
|
|
|
||
|
|
static Mat make_spd(Index n) {
|
||
|
|
Mat M = Mat::Random(n, n);
|
||
|
|
return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
|
||
|
|
}
|
||
|
|
|
||
|
|
static void cuda_warmup() {
|
||
|
|
static bool done = false;
|
||
|
|
if (!done) {
|
||
|
|
void* p;
|
||
|
|
cudaMalloc(&p, 1);
|
||
|
|
cudaFree(p);
|
||
|
|
done = true;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
// Sequential: one solver, N systems solved one after another
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
|
||
|
|
static void BM_Batch_Sequential(benchmark::State& state) {
|
||
|
|
cuda_warmup();
|
||
|
|
const Index n = state.range(0);
|
||
|
|
const int batch_size = static_cast<int>(state.range(1));
|
||
|
|
|
||
|
|
// Pre-generate all SPD matrices and RHS vectors.
|
||
|
|
std::vector<Mat> As(batch_size);
|
||
|
|
std::vector<Mat> Bs(batch_size);
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
As[i] = make_spd(n);
|
||
|
|
Bs[i] = Mat::Random(n, 1);
|
||
|
|
}
|
||
|
|
|
||
|
|
GpuLLT<Scalar> llt;
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
std::vector<Mat> results(batch_size);
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
llt.compute(As[i]);
|
||
|
|
results[i] = llt.solve(Bs[i]);
|
||
|
|
}
|
||
|
|
benchmark::DoNotOptimize(results.back().data());
|
||
|
|
}
|
||
|
|
|
||
|
|
state.counters["n"] = n;
|
||
|
|
state.counters["batch"] = batch_size;
|
||
|
|
state.counters["total_solves"] = batch_size;
|
||
|
|
}
|
||
|
|
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
// Sequential with DeviceMatrix (avoid re-upload of A each iteration)
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
|
||
|
|
static void BM_Batch_Sequential_Device(benchmark::State& state) {
|
||
|
|
cuda_warmup();
|
||
|
|
const Index n = state.range(0);
|
||
|
|
const int batch_size = static_cast<int>(state.range(1));
|
||
|
|
|
||
|
|
std::vector<Mat> As(batch_size);
|
||
|
|
std::vector<Mat> Bs(batch_size);
|
||
|
|
std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
|
||
|
|
std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
As[i] = make_spd(n);
|
||
|
|
Bs[i] = Mat::Random(n, 1);
|
||
|
|
d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
|
||
|
|
d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
|
||
|
|
}
|
||
|
|
|
||
|
|
GpuLLT<Scalar> llt;
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
std::vector<Mat> results(batch_size);
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
llt.compute(d_As[i]);
|
||
|
|
DeviceMatrix<Scalar> d_X = llt.solve(d_Bs[i]);
|
||
|
|
results[i] = d_X.toHost();
|
||
|
|
}
|
||
|
|
benchmark::DoNotOptimize(results.back().data());
|
||
|
|
}
|
||
|
|
|
||
|
|
state.counters["n"] = n;
|
||
|
|
state.counters["batch"] = batch_size;
|
||
|
|
state.counters["total_solves"] = batch_size;
|
||
|
|
}
|
||
|
|
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
// Batched: N solvers on N streams, overlapping execution
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
|
||
|
|
static void BM_Batch_MultiStream(benchmark::State& state) {
|
||
|
|
cuda_warmup();
|
||
|
|
const Index n = state.range(0);
|
||
|
|
const int batch_size = static_cast<int>(state.range(1));
|
||
|
|
|
||
|
|
std::vector<Mat> As(batch_size);
|
||
|
|
std::vector<Mat> Bs(batch_size);
|
||
|
|
std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
|
||
|
|
std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
As[i] = make_spd(n);
|
||
|
|
Bs[i] = Mat::Random(n, 1);
|
||
|
|
d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
|
||
|
|
d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
|
||
|
|
}
|
||
|
|
|
||
|
|
// N solvers = N independent CUDA streams.
|
||
|
|
std::vector<std::unique_ptr<GpuLLT<Scalar>>> solvers(batch_size);
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
solvers[i] = std::make_unique<GpuLLT<Scalar>>();
|
||
|
|
}
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
// Phase 1: launch all factorizations (async, different streams).
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
solvers[i]->compute(d_As[i]);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Phase 2: launch all solves (async, different streams).
|
||
|
|
std::vector<DeviceMatrix<Scalar>> d_Xs(batch_size);
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
d_Xs[i] = solvers[i]->solve(d_Bs[i]);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Phase 3: download all results.
|
||
|
|
std::vector<Mat> results(batch_size);
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
results[i] = d_Xs[i].toHost();
|
||
|
|
}
|
||
|
|
benchmark::DoNotOptimize(results.back().data());
|
||
|
|
}
|
||
|
|
|
||
|
|
state.counters["n"] = n;
|
||
|
|
state.counters["batch"] = batch_size;
|
||
|
|
state.counters["streams"] = batch_size;
|
||
|
|
state.counters["total_solves"] = batch_size;
|
||
|
|
}
|
||
|
|
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
// Batched with async download (overlap D2H with computation)
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
|
||
|
|
static void BM_Batch_MultiStream_AsyncDownload(benchmark::State& state) {
|
||
|
|
cuda_warmup();
|
||
|
|
const Index n = state.range(0);
|
||
|
|
const int batch_size = static_cast<int>(state.range(1));
|
||
|
|
|
||
|
|
std::vector<Mat> As(batch_size);
|
||
|
|
std::vector<Mat> Bs(batch_size);
|
||
|
|
std::vector<DeviceMatrix<Scalar>> d_As(batch_size);
|
||
|
|
std::vector<DeviceMatrix<Scalar>> d_Bs(batch_size);
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
As[i] = make_spd(n);
|
||
|
|
Bs[i] = Mat::Random(n, 1);
|
||
|
|
d_As[i] = DeviceMatrix<Scalar>::fromHost(As[i]);
|
||
|
|
d_Bs[i] = DeviceMatrix<Scalar>::fromHost(Bs[i]);
|
||
|
|
}
|
||
|
|
|
||
|
|
std::vector<std::unique_ptr<GpuLLT<Scalar>>> solvers(batch_size);
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
solvers[i] = std::make_unique<GpuLLT<Scalar>>();
|
||
|
|
}
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
// Launch all compute + solve.
|
||
|
|
std::vector<DeviceMatrix<Scalar>> d_Xs(batch_size);
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
solvers[i]->compute(d_As[i]);
|
||
|
|
d_Xs[i] = solvers[i]->solve(d_Bs[i]);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Enqueue all async downloads.
|
||
|
|
std::vector<HostTransfer<Scalar>> transfers;
|
||
|
|
transfers.reserve(batch_size);
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
transfers.push_back(d_Xs[i].toHostAsync());
|
||
|
|
}
|
||
|
|
|
||
|
|
// Collect all results.
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
benchmark::DoNotOptimize(transfers[i].get().data());
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
state.counters["n"] = n;
|
||
|
|
state.counters["batch"] = batch_size;
|
||
|
|
state.counters["streams"] = batch_size;
|
||
|
|
state.counters["total_solves"] = batch_size;
|
||
|
|
}
|
||
|
|
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
// CPU baseline: Eigen LLT on host, sequential
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
|
||
|
|
static void BM_Batch_CPU(benchmark::State& state) {
|
||
|
|
const Index n = state.range(0);
|
||
|
|
const int batch_size = static_cast<int>(state.range(1));
|
||
|
|
|
||
|
|
std::vector<Mat> As(batch_size);
|
||
|
|
std::vector<Mat> Bs(batch_size);
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
As[i] = make_spd(n);
|
||
|
|
Bs[i] = Mat::Random(n, 1);
|
||
|
|
}
|
||
|
|
|
||
|
|
for (auto _ : state) {
|
||
|
|
std::vector<Mat> results(batch_size);
|
||
|
|
for (int i = 0; i < batch_size; ++i) {
|
||
|
|
LLT<Mat> llt(As[i]);
|
||
|
|
results[i] = llt.solve(Bs[i]);
|
||
|
|
}
|
||
|
|
benchmark::DoNotOptimize(results.back().data());
|
||
|
|
}
|
||
|
|
|
||
|
|
state.counters["n"] = n;
|
||
|
|
state.counters["batch"] = batch_size;
|
||
|
|
state.counters["total_solves"] = batch_size;
|
||
|
|
}
|
||
|
|
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
// Registration
|
||
|
|
// --------------------------------------------------------------------------
|
||
|
|
|
||
|
|
// clang-format off
|
||
|
|
// Args: {matrix_size, batch_size}
|
||
|
|
// Small matrices with large batches are the interesting case for multi-stream.
|
||
|
|
BENCHMARK(BM_Batch_Sequential)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
|
||
|
|
BENCHMARK(BM_Batch_Sequential_Device)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
|
||
|
|
BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
|
||
|
|
BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
|
||
|
|
BENCHMARK(BM_Batch_CPU)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
|
||
|
|
|
||
|
|
// Also run larger sizes with moderate batching.
|
||
|
|
BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond);
|
||
|
|
BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond);
|
||
|
|
// clang-format on
|