// GPU batching benchmarks: multi-stream concurrency for many small solves. // // Each GpuLLT/GpuLU owns its own CUDA stream. This benchmark measures how // well multiple solver instances overlap on the GPU, which is critical for // workloads like robotics (many small systems) and SLAM (batched poses). // // Compares: // 1. Sequential: one solver handles all systems one by one // 2. Batched: N solvers on N streams, all launched before any sync // 3. CPU baseline: Eigen LLT on host // // For Nsight Systems: batched mode should show overlapping kernels on // different streams in the timeline view. // // nsys profile --trace=cuda ./bench_gpu_batching #include #include #include #include #include using namespace Eigen; #ifndef SCALAR #define SCALAR double #endif using Scalar = SCALAR; using Mat = Matrix; static Mat make_spd(Index n) { Mat M = Mat::Random(n, n); return M.adjoint() * M + Mat::Identity(n, n) * static_cast(n); } static void cuda_warmup() { static bool done = false; if (!done) { void* p; cudaMalloc(&p, 1); cudaFree(p); done = true; } } // -------------------------------------------------------------------------- // Sequential: one solver, N systems solved one after another // -------------------------------------------------------------------------- static void BM_Batch_Sequential(benchmark::State& state) { cuda_warmup(); const Index n = state.range(0); const int batch_size = static_cast(state.range(1)); // Pre-generate all SPD matrices and RHS vectors. std::vector As(batch_size); std::vector Bs(batch_size); for (int i = 0; i < batch_size; ++i) { As[i] = make_spd(n); Bs[i] = Mat::Random(n, 1); } GpuLLT llt; for (auto _ : state) { std::vector results(batch_size); for (int i = 0; i < batch_size; ++i) { llt.compute(As[i]); results[i] = llt.solve(Bs[i]); } benchmark::DoNotOptimize(results.back().data()); } state.counters["n"] = n; state.counters["batch"] = batch_size; state.counters["total_solves"] = batch_size; } // -------------------------------------------------------------------------- // Sequential with DeviceMatrix (avoid re-upload of A each iteration) // -------------------------------------------------------------------------- static void BM_Batch_Sequential_Device(benchmark::State& state) { cuda_warmup(); const Index n = state.range(0); const int batch_size = static_cast(state.range(1)); std::vector As(batch_size); std::vector Bs(batch_size); std::vector> d_As(batch_size); std::vector> d_Bs(batch_size); for (int i = 0; i < batch_size; ++i) { As[i] = make_spd(n); Bs[i] = Mat::Random(n, 1); d_As[i] = DeviceMatrix::fromHost(As[i]); d_Bs[i] = DeviceMatrix::fromHost(Bs[i]); } GpuLLT llt; for (auto _ : state) { std::vector results(batch_size); for (int i = 0; i < batch_size; ++i) { llt.compute(d_As[i]); DeviceMatrix d_X = llt.solve(d_Bs[i]); results[i] = d_X.toHost(); } benchmark::DoNotOptimize(results.back().data()); } state.counters["n"] = n; state.counters["batch"] = batch_size; state.counters["total_solves"] = batch_size; } // -------------------------------------------------------------------------- // Batched: N solvers on N streams, overlapping execution // -------------------------------------------------------------------------- static void BM_Batch_MultiStream(benchmark::State& state) { cuda_warmup(); const Index n = state.range(0); const int batch_size = static_cast(state.range(1)); std::vector As(batch_size); std::vector Bs(batch_size); std::vector> d_As(batch_size); std::vector> d_Bs(batch_size); for (int i = 0; i < batch_size; ++i) { As[i] = make_spd(n); Bs[i] = Mat::Random(n, 1); d_As[i] = DeviceMatrix::fromHost(As[i]); d_Bs[i] = DeviceMatrix::fromHost(Bs[i]); } // N solvers = N independent CUDA streams. std::vector>> solvers(batch_size); for (int i = 0; i < batch_size; ++i) { solvers[i] = std::make_unique>(); } for (auto _ : state) { // Phase 1: launch all factorizations (async, different streams). for (int i = 0; i < batch_size; ++i) { solvers[i]->compute(d_As[i]); } // Phase 2: launch all solves (async, different streams). std::vector> d_Xs(batch_size); for (int i = 0; i < batch_size; ++i) { d_Xs[i] = solvers[i]->solve(d_Bs[i]); } // Phase 3: download all results. std::vector results(batch_size); for (int i = 0; i < batch_size; ++i) { results[i] = d_Xs[i].toHost(); } benchmark::DoNotOptimize(results.back().data()); } state.counters["n"] = n; state.counters["batch"] = batch_size; state.counters["streams"] = batch_size; state.counters["total_solves"] = batch_size; } // -------------------------------------------------------------------------- // Batched with async download (overlap D2H with computation) // -------------------------------------------------------------------------- static void BM_Batch_MultiStream_AsyncDownload(benchmark::State& state) { cuda_warmup(); const Index n = state.range(0); const int batch_size = static_cast(state.range(1)); std::vector As(batch_size); std::vector Bs(batch_size); std::vector> d_As(batch_size); std::vector> d_Bs(batch_size); for (int i = 0; i < batch_size; ++i) { As[i] = make_spd(n); Bs[i] = Mat::Random(n, 1); d_As[i] = DeviceMatrix::fromHost(As[i]); d_Bs[i] = DeviceMatrix::fromHost(Bs[i]); } std::vector>> solvers(batch_size); for (int i = 0; i < batch_size; ++i) { solvers[i] = std::make_unique>(); } for (auto _ : state) { // Launch all compute + solve. std::vector> d_Xs(batch_size); for (int i = 0; i < batch_size; ++i) { solvers[i]->compute(d_As[i]); d_Xs[i] = solvers[i]->solve(d_Bs[i]); } // Enqueue all async downloads. std::vector> transfers; transfers.reserve(batch_size); for (int i = 0; i < batch_size; ++i) { transfers.push_back(d_Xs[i].toHostAsync()); } // Collect all results. for (int i = 0; i < batch_size; ++i) { benchmark::DoNotOptimize(transfers[i].get().data()); } } state.counters["n"] = n; state.counters["batch"] = batch_size; state.counters["streams"] = batch_size; state.counters["total_solves"] = batch_size; } // -------------------------------------------------------------------------- // CPU baseline: Eigen LLT on host, sequential // -------------------------------------------------------------------------- static void BM_Batch_CPU(benchmark::State& state) { const Index n = state.range(0); const int batch_size = static_cast(state.range(1)); std::vector As(batch_size); std::vector Bs(batch_size); for (int i = 0; i < batch_size; ++i) { As[i] = make_spd(n); Bs[i] = Mat::Random(n, 1); } for (auto _ : state) { std::vector results(batch_size); for (int i = 0; i < batch_size; ++i) { LLT llt(As[i]); results[i] = llt.solve(Bs[i]); } benchmark::DoNotOptimize(results.back().data()); } state.counters["n"] = n; state.counters["batch"] = batch_size; state.counters["total_solves"] = batch_size; } // -------------------------------------------------------------------------- // Registration // -------------------------------------------------------------------------- // clang-format off // Args: {matrix_size, batch_size} // Small matrices with large batches are the interesting case for multi-stream. BENCHMARK(BM_Batch_Sequential)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond); BENCHMARK(BM_Batch_Sequential_Device)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond); BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond); BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond); BENCHMARK(BM_Batch_CPU)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond); // Also run larger sizes with moderate batching. BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond); BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond); // clang-format on