// GPU solver benchmarks: GpuLLT and GpuLU compute + solve throughput.
//
// Measures factorization and solve performance for the host-matrix and
// DeviceMatrix code paths across a range of matrix sizes.
//
// For Nsight Systems profiling:
//   nsys profile --trace=cuda,nvtx ./bench_gpu_solvers
//
// For Nsight Compute kernel analysis:
//   ncu --set full -o profile ./bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096

// NOTE(review): the extraction that produced this file stripped every '<...>'
// from #include directives and template argument lists. The headers and
// template parameters below are reconstructed from usage — confirm against
// the original source, in particular the project header that declares
// GpuLLT, GpuLU, and DeviceMatrix (its name is not recoverable from here).
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>

#include <utility>

#include <Eigen/Dense>

using namespace Eigen;

// Scalar type is configurable at compile time, e.g. -DSCALAR=float.
#ifndef SCALAR
#define SCALAR double
#endif
using Scalar = SCALAR;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;

// --------------------------------------------------------------------------
// Helpers
// --------------------------------------------------------------------------

// Build an n x n symmetric positive-definite matrix. M^H * M is positive
// semi-definite; adding n on the diagonal makes it safely positive definite
// so the LLT factorization cannot fail for numerical reasons.
static Mat make_spd(Index n) {
  Mat M = Mat::Random(n, n);
  return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
}

// CUDA warm-up: ensure the GPU context is initialized before timing.
static void cuda_warmup() {
  static bool done = false;
  if (!done) {
    void* p = nullptr;
    // A tiny allocation forces context creation. Only free on success —
    // freeing an uninitialized pointer after a failed cudaMalloc is UB;
    // the benchmark body will surface any real error.
    if (cudaMalloc(&p, 1) == cudaSuccess) cudaFree(p);
    done = true;
  }
}

// --------------------------------------------------------------------------
// GpuLLT benchmarks
// --------------------------------------------------------------------------

// Factorize from host matrix (includes H2D upload).
static void BM_GpuLLT_Compute_Host(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = make_spd(n);
  GpuLLT<Mat> llt;  // NOTE(review): template argument reconstructed — verify.
  for (auto _ : state) {
    llt.compute(A);
    if (llt.info() != Success) state.SkipWithError("factorization failed");
  }
  // Cholesky cost: n^3 / 3 flops per factorization.
  double flops = static_cast<double>(n) * static_cast<double>(n) *
                 static_cast<double>(n) / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate,
                         benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

// Factorize from DeviceMatrix (D2D copy path).
// NOTE(review): the '<...>' template argument lists in this section were
// stripped by extraction; the Scalar/Mat parameters below are reconstructed
// from usage — confirm against the original source.
static void BM_GpuLLT_Compute_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = make_spd(n);
  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  GpuLLT<Mat> llt;
  for (auto _ : state) {
    llt.compute(d_A);
    if (llt.info() != Success) state.SkipWithError("factorization failed");
  }
  // Cholesky cost: n^3 / 3 flops per factorization.
  double flops = static_cast<double>(n) * static_cast<double>(n) *
                 static_cast<double>(n) / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate,
                         benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

// Factorize from DeviceMatrix (move path, no copy). Note the per-iteration
// fromHost upload is inside the timed loop, so H2D transfer time is included.
static void BM_GpuLLT_Compute_DeviceMove(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = make_spd(n);
  GpuLLT<Mat> llt;
  for (auto _ : state) {
    auto d_A = DeviceMatrix<Scalar>::fromHost(A);
    llt.compute(std::move(d_A));
    if (llt.info() != Success) state.SkipWithError("factorization failed");
  }
  double flops = static_cast<double>(n) * static_cast<double>(n) *
                 static_cast<double>(n) / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate,
                         benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

// Solve from host matrix (H2D + potrs + D2H).
static void BM_GpuLLT_Solve_Host(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = make_spd(n);
  Mat B = Mat::Random(n, nrhs);
  GpuLLT<Mat> llt(A);  // factorization happens once, outside the timed loop
  for (auto _ : state) {
    Mat X = llt.solve(B);
    benchmark::DoNotOptimize(X.data());
  }
  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}

// Solve from DeviceMatrix (D2D + potrs, async, toHost at end).
// NOTE(review): the '<...>' template argument lists in this section were
// stripped by extraction; the Scalar/Mat parameters below are reconstructed
// from usage — confirm against the original source.
static void BM_GpuLLT_Solve_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = make_spd(n);
  Mat B = Mat::Random(n, nrhs);
  GpuLLT<Mat> llt(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
  for (auto _ : state) {
    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
    Mat X = d_X.toHost();  // D2H copy is part of the measured time
    benchmark::DoNotOptimize(X.data());
  }
  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}

// Solve staying entirely on device (no toHost — measures pure GPU time).
static void BM_GpuLLT_Solve_DeviceOnly(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = make_spd(n);
  Mat B = Mat::Random(n, nrhs);
  GpuLLT<Mat> llt(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
  for (auto _ : state) {
    DeviceMatrix<Scalar> d_X = llt.solve(d_B);
    // Force completion without a D2H transfer so async work is not
    // mis-attributed to a later iteration.
    cudaStreamSynchronize(llt.stream());
    benchmark::DoNotOptimize(d_X.data());
  }
  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}

// --------------------------------------------------------------------------
// GpuLU benchmarks
// --------------------------------------------------------------------------

// Factorize from host matrix (includes H2D upload).
static void BM_GpuLU_Compute_Host(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = Mat::Random(n, n);
  GpuLU<Mat> lu;
  for (auto _ : state) {
    lu.compute(A);
    if (lu.info() != Success) state.SkipWithError("factorization failed");
  }
  // LU cost: 2/3 * n^3 flops per factorization.
  double flops = 2.0 / 3.0 * static_cast<double>(n) *
                 static_cast<double>(n) * static_cast<double>(n);
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate,
                         benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

// Factorize from DeviceMatrix (D2D copy path).
static void BM_GpuLU_Compute_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  Mat A = Mat::Random(n, n);
  auto d_A = DeviceMatrix<Scalar>::fromHost(A);
  GpuLU<Mat> lu;
  for (auto _ : state) {
    lu.compute(d_A);
    if (lu.info() != Success) state.SkipWithError("factorization failed");
  }
  double flops = 2.0 / 3.0 * static_cast<double>(n) *
                 static_cast<double>(n) * static_cast<double>(n);
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate,
                         benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

// Solve from host matrix (H2D + getrs + D2H).
static void BM_GpuLU_Solve_Host(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = Mat::Random(n, n);
  Mat B = Mat::Random(n, nrhs);
  GpuLU<Mat> lu(A);  // factorization happens once, outside the timed loop
  for (auto _ : state) {
    Mat X = lu.solve(B);
    benchmark::DoNotOptimize(X.data());
  }
  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}

// Solve from DeviceMatrix (D2D path, toHost at end).
static void BM_GpuLU_Solve_Device(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index nrhs = state.range(1);
  Mat A = Mat::Random(n, n);
  Mat B = Mat::Random(n, nrhs);
  GpuLU<Mat> lu(A);
  auto d_B = DeviceMatrix<Scalar>::fromHost(B);
  for (auto _ : state) {
    DeviceMatrix<Scalar> d_X = lu.solve(d_B);
    Mat X = d_X.toHost();  // D2H copy is part of the measured time
    benchmark::DoNotOptimize(X.data());
  }
  state.counters["n"] = n;
  state.counters["nrhs"] = nrhs;
}

// --------------------------------------------------------------------------
// CPU baselines for comparison
// --------------------------------------------------------------------------

static void BM_CpuLLT_Compute(benchmark::State& state) {
  const Index n = state.range(0);
  Mat A = make_spd(n);
  LLT<Mat> llt;
  for (auto _ : state) {
    llt.compute(A);
    benchmark::DoNotOptimize(llt.matrixLLT().data());
  }
  double flops = static_cast<double>(n) * static_cast<double>(n) *
                 static_cast<double>(n) / 3.0;
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate,
                         benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

static void BM_CpuLU_Compute(benchmark::State& state) {
  const Index n = state.range(0);
  Mat A = Mat::Random(n, n);
  PartialPivLU<Mat> lu;
  for (auto _ : state) {
    lu.compute(A);
    benchmark::DoNotOptimize(lu.matrixLU().data());
  }
  double flops = 2.0 / 3.0 * static_cast<double>(n) *
                 static_cast<double>(n) * static_cast<double>(n);
  state.counters["GFLOPS"] =
      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate,
                         benchmark::Counter::kIs1000);
  state.counters["n"] = n;
}

// --------------------------------------------------------------------------
// Registration
// --------------------------------------------------------------------------

// clang-format off
BENCHMARK(BM_GpuLLT_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Compute_DeviceMove)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLLT_Solve_DeviceOnly)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_GpuLU_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_CpuLLT_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_CpuLU_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
// clang-format on