Files
eigen/benchmarks/GPU/bench_gpu_ba.cu
Rasmus Munk Larsen 014f12f11a GPU: Add BLAS-1 ops, DeviceScalar, device-resident SpMV, and CG interop (5/5)
Add the operator interface needed for GPU iterative solvers:

- BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(),
  noalias(), operator+=/-=/\*= dispatching to cuBLAS axpy/scal/dot/nrm2.
- DeviceScalar<Scalar>: device-resident scalar returned by reductions.
  Defers host sync until value is read (implicit conversion). Device-side
  division via NPP for real types.
- GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(),
  cusparseHandle().
- GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm
  selection and plan caching.
- GpuSparseContext: GpuContext& constructor for same-stream execution,
  deviceView() returning DeviceSparseView with operator* for device-resident
  SpMV (d_y = d_A * d_x).
- geam expressions: d_C = d_A + alpha * d_B via cublasXgeam.
- GpuSVD::matrixV() convenience wrapper.

These additions make DeviceMatrix usable as a VectorType in Eigen algorithm
templates. Conjugate gradient is the motivating example and is tested against
CPU ConjugateGradient for correctness.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 20:19:59 -07:00

534 lines
17 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Bundle Adjustment benchmark: GPU CG vs CPU CG on real BAL datasets.
//
// Tests Eigen's GPU CG pipeline (DeviceMatrix + GpuSparseContext + DeviceScalar)
// on the normal equations (J^T*J) arising from bundle adjustment problems.
//
// Reads a BAL (Bundle Adjustment in the Large) format file, computes the
// Jacobian and residual, forms the normal equations H = J^T*J + lambda*I,
// then solves H*dx = -J^T*r with both CPU and GPU conjugate gradients.
//
// BAL format: http://grail.cs.washington.edu/projects/bal/
//
// Usage:
// cmake --build build-bench-gpu --target bench_gpu_ba
//
// # Download a BAL dataset (bz2-compressed):
// wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-49-7776-pre.txt.bz2
// bunzip2 problem-49-7776-pre.txt.bz2
//
// # Run on a specific problem:
// BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba
//
// # Append results to the log:
// BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba \
// --benchmark_format=console 2>&1 | tee -a benchmarks/GPU/ba_results.log
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/GPU>
#include <cmath>
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>
using namespace Eigen;
// ============================================================================
// BAL problem data
// ============================================================================
struct BALProblem {
int num_cameras = 0;
int num_points = 0;
int num_observations = 0;
// Observations: (camera_idx, point_idx, observed_x, observed_y).
std::vector<int> camera_index;
std::vector<int> point_index;
std::vector<double> observations_x;
std::vector<double> observations_y;
// Camera parameters: 9 per camera (Rodrigues r[3], translation t[3], f, k1, k2).
std::vector<double> cameras; // [num_cameras * 9]
// 3D points: 3 per point.
std::vector<double> points; // [num_points * 3]
const double* camera(int i) const { return &cameras[i * 9]; }
const double* point(int i) const { return &points[i * 3]; }
bool load(const std::string& filename) {
std::ifstream in(filename);
if (!in) {
fprintf(stderr, "ERROR: Cannot open BAL file: %s\n", filename.c_str());
return false;
}
in >> num_cameras >> num_points >> num_observations;
if (!in || num_cameras <= 0 || num_points <= 0 || num_observations <= 0) {
fprintf(stderr, "ERROR: Invalid BAL header in %s\n", filename.c_str());
return false;
}
camera_index.resize(num_observations);
point_index.resize(num_observations);
observations_x.resize(num_observations);
observations_y.resize(num_observations);
for (int i = 0; i < num_observations; ++i) {
in >> camera_index[i] >> point_index[i] >> observations_x[i] >> observations_y[i];
}
cameras.resize(num_cameras * 9);
for (int i = 0; i < num_cameras * 9; ++i) {
in >> cameras[i];
}
points.resize(num_points * 3);
for (int i = 0; i < num_points * 3; ++i) {
in >> points[i];
}
if (!in) {
fprintf(stderr, "ERROR: Truncated BAL file: %s\n", filename.c_str());
return false;
}
fprintf(stderr, "Loaded BAL: %d cameras, %d points, %d observations\n", num_cameras, num_points, num_observations);
return true;
}
};
// ============================================================================
// Camera projection model (BAL convention)
// ============================================================================
// Rodrigues rotation: rotate point X by axis-angle vector omega.
static void rodrigues_rotate(const double* omega, const double* X, double* result) {
double theta2 = omega[0] * omega[0] + omega[1] * omega[1] + omega[2] * omega[2];
if (theta2 > 1e-30) {
double theta = std::sqrt(theta2);
double costh = std::cos(theta);
double sinth = std::sin(theta);
double k = (1.0 - costh) / theta2;
// Cross product omega x X.
double wx = omega[1] * X[2] - omega[2] * X[1];
double wy = omega[2] * X[0] - omega[0] * X[2];
double wz = omega[0] * X[1] - omega[1] * X[0];
// Dot product omega . X.
double dot = omega[0] * X[0] + omega[1] * X[1] + omega[2] * X[2];
result[0] = X[0] * costh + wx * (sinth / theta) + omega[0] * dot * k;
result[1] = X[1] * costh + wy * (sinth / theta) + omega[1] * dot * k;
result[2] = X[2] * costh + wz * (sinth / theta) + omega[2] * dot * k;
} else {
// Small angle: R ≈ I + [omega]×.
result[0] = X[0] + omega[1] * X[2] - omega[2] * X[1];
result[1] = X[1] + omega[2] * X[0] - omega[0] * X[2];
result[2] = X[2] + omega[0] * X[1] - omega[1] * X[0];
}
}
// Project a 3D point through a camera, returning the 2D residual.
// camera: [r0,r1,r2, t0,t1,t2, f, k1, k2]
// point: [X, Y, Z]
// observed: [ox, oy]
// residual: [rx, ry] = projected - observed
static void project(const double* camera, const double* point, const double* observed, double* residual) {
// Rotate.
double P[3];
rodrigues_rotate(camera, point, P);
// Translate.
P[0] += camera[3];
P[1] += camera[4];
P[2] += camera[5];
// Normalize (BAL convention: negative z).
double xp = -P[0] / P[2];
double yp = -P[1] / P[2];
// Radial distortion.
double r2 = xp * xp + yp * yp;
double distortion = 1.0 + camera[7] * r2 + camera[8] * r2 * r2;
// Apply focal length.
double predicted_x = camera[6] * distortion * xp;
double predicted_y = camera[6] * distortion * yp;
residual[0] = predicted_x - observed[0];
residual[1] = predicted_y - observed[1];
}
// ============================================================================
// Jacobian computation (numerical differentiation)
// ============================================================================
// Compute the 2x9 Jacobian block w.r.t. camera params and 2x3 block w.r.t.
// point coords for a single observation, using central finite differences.
static void compute_jacobian_block(const double* camera, const double* point, const double* observed,
double* J_cam, // 2x9, row-major
double* J_point) // 2x3, row-major
{
constexpr double eps = 1e-8;
// Camera parameters (9).
double cam_pert[9];
std::copy(camera, camera + 9, cam_pert);
for (int j = 0; j < 9; ++j) {
double orig = cam_pert[j];
double rp[2], rm[2];
cam_pert[j] = orig + eps;
project(cam_pert, point, observed, rp);
cam_pert[j] = orig - eps;
project(cam_pert, point, observed, rm);
cam_pert[j] = orig;
J_cam[0 * 9 + j] = (rp[0] - rm[0]) / (2.0 * eps);
J_cam[1 * 9 + j] = (rp[1] - rm[1]) / (2.0 * eps);
}
// Point coordinates (3).
double pt_pert[3];
std::copy(point, point + 3, pt_pert);
for (int j = 0; j < 3; ++j) {
double orig = pt_pert[j];
double rp[2], rm[2];
pt_pert[j] = orig + eps;
project(camera, pt_pert, observed, rp);
pt_pert[j] = orig - eps;
project(camera, pt_pert, observed, rm);
pt_pert[j] = orig;
J_point[0 * 3 + j] = (rp[0] - rm[0]) / (2.0 * eps);
J_point[1 * 3 + j] = (rp[1] - rm[1]) / (2.0 * eps);
}
}
// ============================================================================
// Build normal equations: H = J^T*J + lambda*I, g = -J^T*r
// ============================================================================
struct NormalEquations {
SparseMatrix<double, ColMajor, int> H;
VectorXd g;
VectorXd residual;
double residual_norm;
int jacobian_rows;
int jacobian_cols;
long jacobian_nnz;
};
static NormalEquations build_normal_equations(const BALProblem& problem, double lambda = 1.0) {
const int num_cam_params = problem.num_cameras * 9;
const int num_pt_params = problem.num_points * 3;
const int num_params = num_cam_params + num_pt_params;
const int num_residuals = problem.num_observations * 2;
fprintf(stderr, "Building Jacobian: %d x %d, %ld nonzeros\n", num_residuals, num_params,
(long)problem.num_observations * 24);
// Build J as a triplet list.
using Triplet = Eigen::Triplet<double>;
std::vector<Triplet> triplets;
triplets.reserve(problem.num_observations * 24); // 2 rows × 12 nonzeros = 24 entries per obs
VectorXd residual(num_residuals);
for (int obs = 0; obs < problem.num_observations; ++obs) {
int ci = problem.camera_index[obs];
int pi = problem.point_index[obs];
double observed[2] = {problem.observations_x[obs], problem.observations_y[obs]};
// Compute residual.
double r[2];
project(problem.camera(ci), problem.point(pi), observed, r);
residual[obs * 2 + 0] = r[0];
residual[obs * 2 + 1] = r[1];
// Compute Jacobian blocks.
double J_cam[18], J_pt[6]; // 2x9 and 2x3
compute_jacobian_block(problem.camera(ci), problem.point(pi), observed, J_cam, J_pt);
// Insert camera block: rows [2*obs, 2*obs+1], cols [9*ci, 9*ci+8].
for (int row = 0; row < 2; ++row) {
for (int col = 0; col < 9; ++col) {
double val = J_cam[row * 9 + col];
if (val != 0.0) {
triplets.emplace_back(obs * 2 + row, ci * 9 + col, val);
}
}
}
// Insert point block: rows [2*obs, 2*obs+1], cols [num_cam_params + 3*pi, ...].
for (int row = 0; row < 2; ++row) {
for (int col = 0; col < 3; ++col) {
double val = J_pt[row * 3 + col];
if (val != 0.0) {
triplets.emplace_back(obs * 2 + row, num_cam_params + pi * 3 + col, val);
}
}
}
}
// Build sparse Jacobian.
SparseMatrix<double, ColMajor, int> J(num_residuals, num_params);
J.setFromTriplets(triplets.begin(), triplets.end());
fprintf(stderr, "Jacobian: %dx%d, nnz=%ld\n", (int)J.rows(), (int)J.cols(), (long)J.nonZeros());
// Form normal equations: H = J^T*J + lambda*I.
SparseMatrix<double, ColMajor, int> H = (J.transpose() * J).pruned();
// Add Levenberg-Marquardt damping.
for (int i = 0; i < num_params; ++i) {
H.coeffRef(i, i) += lambda;
}
H.makeCompressed();
// Gradient: g = -J^T * r.
VectorXd g = -(J.transpose() * residual);
double rnorm = residual.norm();
fprintf(stderr, "Normal equations: H is %dx%d, nnz=%ld, |r|=%.6e\n", (int)H.rows(), (int)H.cols(), (long)H.nonZeros(),
rnorm);
return {std::move(H), std::move(g), std::move(residual), rnorm, num_residuals, num_params, (long)J.nonZeros()};
}
// ============================================================================
// Global problem state (loaded once before benchmarks run)
// ============================================================================
static BALProblem g_problem;
static NormalEquations g_neq;
static bool g_loaded = false;
static void ensure_loaded() {
if (g_loaded) return;
const char* bal_file = std::getenv("BAL_FILE");
if (!bal_file) {
fprintf(stderr,
"ERROR: Set BAL_FILE environment variable to a BAL problem file.\n"
" Download from: http://grail.cs.washington.edu/projects/bal/\n"
" Example:\n"
" wget http://grail.cs.washington.edu/projects/bal/data/ladybug/"
"problem-49-7776-pre.txt.bz2\n"
" bunzip2 problem-49-7776-pre.txt.bz2\n"
" BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba\n");
std::exit(1);
}
if (!g_problem.load(bal_file)) {
std::exit(1);
}
g_neq = build_normal_equations(g_problem);
g_loaded = true;
}
// ============================================================================
// CPU CG benchmark
// ============================================================================
static void BM_BA_CPU_CG(benchmark::State& state) {
ensure_loaded();
const auto& H = g_neq.H;
const auto& g = g_neq.g;
ConjugateGradient<SparseMatrix<double, ColMajor, int>, Lower | Upper> cg;
cg.setMaxIterations(10000);
cg.setTolerance(1e-8);
cg.compute(H);
int last_iters = 0;
double last_error = 0;
for (auto _ : state) {
VectorXd dx = cg.solve(g);
benchmark::DoNotOptimize(dx.data());
last_iters = cg.iterations();
last_error = cg.error();
}
state.counters["n"] = H.rows();
state.counters["nnz"] = H.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
state.counters["cameras"] = g_problem.num_cameras;
state.counters["points"] = g_problem.num_points;
state.counters["observations"] = g_problem.num_observations;
}
// ============================================================================
// GPU CG benchmark (with Jacobi preconditioner)
// ============================================================================
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 1);
cudaFree(p);
done = true;
}
}
static void BM_BA_GPU_CG(benchmark::State& state) {
ensure_loaded();
cuda_warmup();
const auto& H = g_neq.H;
const auto& g = g_neq.g;
const Index n = H.rows();
// Extract inverse diagonal (Jacobi preconditioner).
using SpMat = SparseMatrix<double, ColMajor, int>;
VectorXd invdiag(n);
for (Index j = 0; j < H.outerSize(); ++j) {
SpMat::InnerIterator it(H, j);
while (it && it.index() != j) ++it;
if (it && it.index() == j && it.value() != 0.0)
invdiag(j) = 1.0 / it.value();
else
invdiag(j) = 1.0;
}
// Set up GPU context and upload data.
GpuContext ctx;
GpuContext::setThreadLocal(&ctx);
GpuSparseContext<double> spmv_ctx(ctx);
auto mat = spmv_ctx.deviceView(H);
auto d_invdiag = DeviceMatrix<double>::fromHost(invdiag, ctx.stream());
auto d_g = DeviceMatrix<double>::fromHost(g, ctx.stream());
int last_iters = 0;
double last_error = 0;
for (auto _ : state) {
DeviceMatrix<double> d_x(n, 1);
d_x.setZero(ctx);
DeviceMatrix<double> residual(n, 1);
residual.copyFrom(ctx, d_g);
double rhsNorm2 = d_g.squaredNorm(ctx);
double threshold = 1e-8 * 1e-8 * rhsNorm2;
double residualNorm2 = residual.squaredNorm(ctx);
DeviceMatrix<double> p = d_invdiag.cwiseProduct(ctx, residual);
DeviceMatrix<double> z(n, 1), tmp(n, 1);
auto absNew = residual.dot(ctx, p);
Index i = 0;
Index maxIters = 10000;
while (i < maxIters) {
tmp.noalias() = mat * p;
auto alpha = absNew / p.dot(ctx, tmp);
d_x += alpha * p;
residual -= alpha * tmp;
residualNorm2 = residual.squaredNorm(ctx);
if (residualNorm2 < threshold) break;
z.cwiseProduct(ctx, d_invdiag, residual); // in-place, no allocation
auto absOld = std::move(absNew);
absNew = residual.dot(ctx, z);
auto beta = absNew / absOld;
p *= beta; // device-pointer scal, no host sync
p += z;
i++;
}
benchmark::DoNotOptimize(d_x.data());
last_iters = i;
last_error = std::sqrt(residualNorm2 / rhsNorm2);
}
GpuContext::setThreadLocal(nullptr);
state.counters["n"] = n;
state.counters["nnz"] = H.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
state.counters["cameras"] = g_problem.num_cameras;
state.counters["points"] = g_problem.num_points;
state.counters["observations"] = g_problem.num_observations;
}
// ============================================================================
// CPU CG with Jacobi preconditioner (apples-to-apples comparison)
// ============================================================================
static void BM_BA_CPU_CG_Jacobi(benchmark::State& state) {
ensure_loaded();
const auto& H = g_neq.H;
const auto& g = g_neq.g;
// Eigen's DiagonalPreconditioner is effectively Jacobi.
ConjugateGradient<SparseMatrix<double, ColMajor, int>, Lower | Upper> cg;
cg.setMaxIterations(10000);
cg.setTolerance(1e-8);
cg.compute(H);
int last_iters = 0;
double last_error = 0;
for (auto _ : state) {
VectorXd dx = cg.solve(g);
benchmark::DoNotOptimize(dx.data());
last_iters = cg.iterations();
last_error = cg.error();
}
state.counters["n"] = H.rows();
state.counters["nnz"] = H.nonZeros();
state.counters["iters"] = last_iters;
state.counters["error"] = last_error;
}
// ============================================================================
// Register benchmarks
// ============================================================================
BENCHMARK(BM_BA_CPU_CG)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_BA_CPU_CG_Jacobi)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_BA_GPU_CG)->Unit(benchmark::kMillisecond);
// ============================================================================
// Custom main: print summary after benchmarks
// ============================================================================
int main(int argc, char** argv) {
benchmark::Initialize(&argc, argv);
// Print problem info before benchmarks.
const char* bal_file = std::getenv("BAL_FILE");
if (bal_file) {
ensure_loaded();
fprintf(stderr,
"\n"
"=== Bundle Adjustment GPU CG Benchmark ===\n"
"BAL file: %s\n"
"Cameras: %d\n"
"Points: %d\n"
"Observations: %d\n"
"J size: %d x %d, nnz=%ld\n"
"H size: %d x %d, nnz=%ld\n"
"|residual|: %.6e\n"
"==========================================\n\n",
bal_file, g_problem.num_cameras, g_problem.num_points, g_problem.num_observations, g_neq.jacobian_rows,
g_neq.jacobian_cols, g_neq.jacobian_nnz, (int)g_neq.H.rows(), (int)g_neq.H.cols(), (long)g_neq.H.nonZeros(),
g_neq.residual_norm);
}
benchmark::RunSpecifiedBenchmarks();
benchmark::Shutdown();
return 0;
}