eigen/benchmarks/GPU/bench_gpu_ba.cu

// Bundle Adjustment benchmark: GPU CG vs CPU CG on real BAL datasets.
//
// Tests Eigen's GPU CG pipeline (DeviceMatrix + GpuSparseContext + DeviceScalar)
// on the normal equations (J^T*J) arising from bundle adjustment problems.
//
// Reads a BAL (Bundle Adjustment in the Large) format file, computes the
// Jacobian and residual, forms the normal equations H = J^T*J + lambda*I,
// then solves H*dx = -J^T*r with both CPU and GPU conjugate gradients.
//
// BAL format: http://grail.cs.washington.edu/projects/bal/
//
// Usage:
//   cmake --build build-bench-gpu --target bench_gpu_ba
//
//   # Download a BAL dataset (bz2-compressed):
//   wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-49-7776-pre.txt.bz2
//   bunzip2 problem-49-7776-pre.txt.bz2
//
//   # Run on a specific problem:
//   BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba
//
//   # Append results to the log:
//   BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba \
//     --benchmark_format=console 2>&1 | tee -a benchmarks/GPU/ba_results.log

#include <benchmark/benchmark.h>

#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/GPU>

#include <cmath>
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>

using namespace Eigen;

// ============================================================================
// BAL problem data
// ============================================================================

struct BALProblem {
  int num_cameras = 0;
  int num_points = 0;
  int num_observations = 0;

  // Observations: (camera_idx, point_idx, observed_x, observed_y).
  std::vector<int> camera_index;
  std::vector<int> point_index;
  std::vector<double> observations_x;
  std::vector<double> observations_y;

  // Camera parameters: 9 per camera (Rodrigues r[3], translation t[3], f, k1, k2).
  std::vector<double> cameras;  // [num_cameras * 9]

  // 3D points: 3 per point.
  std::vector<double> points;  // [num_points * 3]

  const double* camera(int i) const { return &cameras[i * 9]; }
  const double* point(int i) const { return &points[i * 3]; }

  bool load(const std::string& filename) {
    std::ifstream in(filename);
    if (!in) {
      fprintf(stderr, "ERROR: Cannot open BAL file: %s\n", filename.c_str());
      return false;
    }

    in >> num_cameras >> num_points >> num_observations;
    if (!in || num_cameras <= 0 || num_points <= 0 || num_observations <= 0) {
      fprintf(stderr, "ERROR: Invalid BAL header in %s\n", filename.c_str());
      return false;
    }

    camera_index.resize(num_observations);
    point_index.resize(num_observations);
    observations_x.resize(num_observations);
    observations_y.resize(num_observations);
    for (int i = 0; i < num_observations; ++i) {
      in >> camera_index[i] >> point_index[i] >> observations_x[i] >> observations_y[i];
    }

    cameras.resize(num_cameras * 9);
    for (int i = 0; i < num_cameras * 9; ++i) {
      in >> cameras[i];
    }

    points.resize(num_points * 3);
    for (int i = 0; i < num_points * 3; ++i) {
      in >> points[i];
    }

    if (!in) {
      fprintf(stderr, "ERROR: Truncated BAL file: %s\n", filename.c_str());
      return false;
    }

    fprintf(stderr, "Loaded BAL: %d cameras, %d points, %d observations\n", num_cameras, num_points, num_observations);
    return true;
  }
};

// ============================================================================
// Camera projection model (BAL convention)
// ============================================================================

// Rodrigues rotation: rotate point X by axis-angle vector omega.
static void rodrigues_rotate(const double* omega, const double* X, double* result) {
  double theta2 = omega[0] * omega[0] + omega[1] * omega[1] + omega[2] * omega[2];
  if (theta2 > 1e-30) {
    double theta = std::sqrt(theta2);
    double costh = std::cos(theta);
    double sinth = std::sin(theta);
    double k = (1.0 - costh) / theta2;

    // Cross product omega x X.
    double wx = omega[1] * X[2] - omega[2] * X[1];
    double wy = omega[2] * X[0] - omega[0] * X[2];
    double wz = omega[0] * X[1] - omega[1] * X[0];

    // Dot product omega . X.
    double dot = omega[0] * X[0] + omega[1] * X[1] + omega[2] * X[2];

    result[0] = X[0] * costh + wx * (sinth / theta) + omega[0] * dot * k;
    result[1] = X[1] * costh + wy * (sinth / theta) + omega[1] * dot * k;
    result[2] = X[2] * costh + wz * (sinth / theta) + omega[2] * dot * k;
  } else {
    // Small angle: R ≈ I + [omega]×.
    result[0] = X[0] + omega[1] * X[2] - omega[2] * X[1];
    result[1] = X[1] + omega[2] * X[0] - omega[0] * X[2];
    result[2] = X[2] + omega[0] * X[1] - omega[1] * X[0];
  }
}

// Project a 3D point through a camera, returning the 2D residual.
// camera: [r0,r1,r2, t0,t1,t2, f, k1, k2]
// point:  [X, Y, Z]
// observed: [ox, oy]
// residual: [rx, ry] = projected - observed
static void project(const double* camera, const double* point, const double* observed, double* residual) {
  // Rotate.
  double P[3];
  rodrigues_rotate(camera, point, P);

  // Translate.
  P[0] += camera[3];
  P[1] += camera[4];
  P[2] += camera[5];

  // Normalize (BAL convention: negative z).
  double xp = -P[0] / P[2];
  double yp = -P[1] / P[2];

  // Radial distortion.
  double r2 = xp * xp + yp * yp;
  double distortion = 1.0 + camera[7] * r2 + camera[8] * r2 * r2;

  // Apply focal length.
  double predicted_x = camera[6] * distortion * xp;
  double predicted_y = camera[6] * distortion * yp;

  residual[0] = predicted_x - observed[0];
  residual[1] = predicted_y - observed[1];
}

// ============================================================================
// Jacobian computation (numerical differentiation)
// ============================================================================

// Compute the 2x9 Jacobian block w.r.t. camera params and 2x3 block w.r.t.
// point coords for a single observation, using central finite differences.
static void compute_jacobian_block(const double* camera, const double* point, const double* observed,
                                   double* J_cam,    // 2x9, row-major
                                   double* J_point)  // 2x3, row-major
{
  constexpr double eps = 1e-8;

  // Camera parameters (9).
  double cam_pert[9];
  std::copy(camera, camera + 9, cam_pert);
  for (int j = 0; j < 9; ++j) {
    double orig = cam_pert[j];
    double rp[2], rm[2];

    cam_pert[j] = orig + eps;
    project(cam_pert, point, observed, rp);
    cam_pert[j] = orig - eps;
    project(cam_pert, point, observed, rm);
    cam_pert[j] = orig;

    J_cam[0 * 9 + j] = (rp[0] - rm[0]) / (2.0 * eps);
    J_cam[1 * 9 + j] = (rp[1] - rm[1]) / (2.0 * eps);
  }

  // Point coordinates (3).
  double pt_pert[3];
  std::copy(point, point + 3, pt_pert);
  for (int j = 0; j < 3; ++j) {
    double orig = pt_pert[j];
    double rp[2], rm[2];

    pt_pert[j] = orig + eps;
    project(camera, pt_pert, observed, rp);
    pt_pert[j] = orig - eps;
    project(camera, pt_pert, observed, rm);
    pt_pert[j] = orig;

    J_point[0 * 3 + j] = (rp[0] - rm[0]) / (2.0 * eps);
    J_point[1 * 3 + j] = (rp[1] - rm[1]) / (2.0 * eps);
  }
}

// ============================================================================
// Build normal equations: H = J^T*J + lambda*I, g = -J^T*r
// ============================================================================

struct NormalEquations {
  SparseMatrix<double, ColMajor, int> H;
  VectorXd g;
  VectorXd residual;
  double residual_norm;
  int jacobian_rows;
  int jacobian_cols;
  long jacobian_nnz;
};

static NormalEquations build_normal_equations(const BALProblem& problem, double lambda = 1.0) {
  const int num_cam_params = problem.num_cameras * 9;
  const int num_pt_params = problem.num_points * 3;
  const int num_params = num_cam_params + num_pt_params;
  const int num_residuals = problem.num_observations * 2;

  fprintf(stderr, "Building Jacobian: %d x %d, %ld nonzeros\n", num_residuals, num_params,
          (long)problem.num_observations * 24);

  // Build J as a triplet list.
  using Triplet = Eigen::Triplet<double>;
  std::vector<Triplet> triplets;
  triplets.reserve(problem.num_observations * 24);  // 2 rows × 12 nonzeros = 24 entries per obs

  VectorXd residual(num_residuals);

  for (int obs = 0; obs < problem.num_observations; ++obs) {
    int ci = problem.camera_index[obs];
    int pi = problem.point_index[obs];
    double observed[2] = {problem.observations_x[obs], problem.observations_y[obs]};

    // Compute residual.
    double r[2];
    project(problem.camera(ci), problem.point(pi), observed, r);
    residual[obs * 2 + 0] = r[0];
    residual[obs * 2 + 1] = r[1];

    // Compute Jacobian blocks.
    double J_cam[18], J_pt[6];  // 2x9 and 2x3
    compute_jacobian_block(problem.camera(ci), problem.point(pi), observed, J_cam, J_pt);

    // Insert camera block: rows [2*obs, 2*obs+1], cols [9*ci, 9*ci+8].
    for (int row = 0; row < 2; ++row) {
      for (int col = 0; col < 9; ++col) {
        double val = J_cam[row * 9 + col];
        if (val != 0.0) {
          triplets.emplace_back(obs * 2 + row, ci * 9 + col, val);
        }
      }
    }

    // Insert point block: rows [2*obs, 2*obs+1], cols [num_cam_params + 3*pi, ...].
    for (int row = 0; row < 2; ++row) {
      for (int col = 0; col < 3; ++col) {
        double val = J_pt[row * 3 + col];
        if (val != 0.0) {
          triplets.emplace_back(obs * 2 + row, num_cam_params + pi * 3 + col, val);
        }
      }
    }
  }

  // Build sparse Jacobian.
  SparseMatrix<double, ColMajor, int> J(num_residuals, num_params);
  J.setFromTriplets(triplets.begin(), triplets.end());

  fprintf(stderr, "Jacobian: %dx%d, nnz=%ld\n", (int)J.rows(), (int)J.cols(), (long)J.nonZeros());

  // Form normal equations: H = J^T*J + lambda*I.
  SparseMatrix<double, ColMajor, int> H = (J.transpose() * J).pruned();

  // Add Levenberg-Marquardt damping.
  for (int i = 0; i < num_params; ++i) {
    H.coeffRef(i, i) += lambda;
  }
  H.makeCompressed();

  // Gradient: g = -J^T * r.
  VectorXd g = -(J.transpose() * residual);

  double rnorm = residual.norm();
  fprintf(stderr, "Normal equations: H is %dx%d, nnz=%ld, |r|=%.6e\n", (int)H.rows(), (int)H.cols(), (long)H.nonZeros(),
          rnorm);

  return {std::move(H), std::move(g), std::move(residual), rnorm, num_residuals, num_params, (long)J.nonZeros()};
}

// ============================================================================
// Global problem state (loaded once before benchmarks run)
// ============================================================================

static BALProblem g_problem;
static NormalEquations g_neq;
static bool g_loaded = false;

static void ensure_loaded() {
  if (g_loaded) return;

  const char* bal_file = std::getenv("BAL_FILE");
  if (!bal_file) {
    fprintf(stderr,
            "ERROR: Set BAL_FILE environment variable to a BAL problem file.\n"
            "  Download from: http://grail.cs.washington.edu/projects/bal/\n"
            "  Example:\n"
            "    wget http://grail.cs.washington.edu/projects/bal/data/ladybug/"
            "problem-49-7776-pre.txt.bz2\n"
            "    bunzip2 problem-49-7776-pre.txt.bz2\n"
            "    BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba\n");
    std::exit(1);
  }

  if (!g_problem.load(bal_file)) {
    std::exit(1);
  }

  g_neq = build_normal_equations(g_problem);
  g_loaded = true;
}

// ============================================================================
// CPU CG benchmark
// ============================================================================

static void BM_BA_CPU_CG(benchmark::State& state) {
  ensure_loaded();
  const auto& H = g_neq.H;
  const auto& g = g_neq.g;

  ConjugateGradient<SparseMatrix<double, ColMajor, int>, Lower | Upper> cg;
  cg.setMaxIterations(10000);
  cg.setTolerance(1e-8);
  cg.compute(H);

  int last_iters = 0;
  double last_error = 0;
  for (auto _ : state) {
    VectorXd dx = cg.solve(g);
    benchmark::DoNotOptimize(dx.data());
    last_iters = cg.iterations();
    last_error = cg.error();
  }

  state.counters["n"] = H.rows();
  state.counters["nnz"] = H.nonZeros();
  state.counters["iters"] = last_iters;
  state.counters["error"] = last_error;
  state.counters["cameras"] = g_problem.num_cameras;
  state.counters["points"] = g_problem.num_points;
  state.counters["observations"] = g_problem.num_observations;
}

// ============================================================================
// GPU CG benchmark (with Jacobi preconditioner)
// ============================================================================

static void cuda_warmup() {
  static bool done = false;
  if (!done) {
    void* p;
    cudaMalloc(&p, 1);
    cudaFree(p);
    done = true;
  }
}

static void BM_BA_GPU_CG(benchmark::State& state) {
  ensure_loaded();
  cuda_warmup();

  const auto& H = g_neq.H;
  const auto& g = g_neq.g;
  const Index n = H.rows();

  // Extract inverse diagonal (Jacobi preconditioner).
  using SpMat = SparseMatrix<double, ColMajor, int>;
  VectorXd invdiag(n);
  for (Index j = 0; j < H.outerSize(); ++j) {
    SpMat::InnerIterator it(H, j);
    while (it && it.index() != j) ++it;
    if (it && it.index() == j && it.value() != 0.0)
      invdiag(j) = 1.0 / it.value();
    else
      invdiag(j) = 1.0;
  }

  // Set up GPU context and upload data.
  GpuContext ctx;
  GpuContext::setThreadLocal(&ctx);
  GpuSparseContext<double> spmv_ctx(ctx);
  auto mat = spmv_ctx.deviceView(H);
  auto d_invdiag = DeviceMatrix<double>::fromHost(invdiag, ctx.stream());
  auto d_g = DeviceMatrix<double>::fromHost(g, ctx.stream());

  int last_iters = 0;
  double last_error = 0;

  for (auto _ : state) {
    DeviceMatrix<double> d_x(n, 1);
    d_x.setZero(ctx);
    DeviceMatrix<double> residual(n, 1);
    residual.copyFrom(ctx, d_g);

    double rhsNorm2 = d_g.squaredNorm(ctx);
    double threshold = 1e-8 * 1e-8 * rhsNorm2;
    double residualNorm2 = residual.squaredNorm(ctx);

    DeviceMatrix<double> p = d_invdiag.cwiseProduct(ctx, residual);
    DeviceMatrix<double> z(n, 1), tmp(n, 1);

    auto absNew = residual.dot(ctx, p);
    Index i = 0;
    Index maxIters = 10000;
    while (i < maxIters) {
      tmp.noalias() = mat * p;
      auto alpha = absNew / p.dot(ctx, tmp);
      d_x += alpha * p;
      residual -= alpha * tmp;

      residualNorm2 = residual.squaredNorm(ctx);
      if (residualNorm2 < threshold) break;

      z.cwiseProduct(ctx, d_invdiag, residual);  // in-place, no allocation
      auto absOld = std::move(absNew);
      absNew = residual.dot(ctx, z);
      auto beta = absNew / absOld;

      p *= beta;  // device-pointer scal, no host sync
      p += z;
      i++;
    }
    benchmark::DoNotOptimize(d_x.data());
    last_iters = i;
    last_error = std::sqrt(residualNorm2 / rhsNorm2);
  }

  GpuContext::setThreadLocal(nullptr);

  state.counters["n"] = n;
  state.counters["nnz"] = H.nonZeros();
  state.counters["iters"] = last_iters;
  state.counters["error"] = last_error;
  state.counters["cameras"] = g_problem.num_cameras;
  state.counters["points"] = g_problem.num_points;
  state.counters["observations"] = g_problem.num_observations;
}

// ============================================================================
// CPU CG with Jacobi preconditioner (apples-to-apples comparison)
// ============================================================================

static void BM_BA_CPU_CG_Jacobi(benchmark::State& state) {
  ensure_loaded();
  const auto& H = g_neq.H;
  const auto& g = g_neq.g;

  // Eigen's DiagonalPreconditioner is effectively Jacobi.
  ConjugateGradient<SparseMatrix<double, ColMajor, int>, Lower | Upper> cg;
  cg.setMaxIterations(10000);
  cg.setTolerance(1e-8);
  cg.compute(H);

  int last_iters = 0;
  double last_error = 0;
  for (auto _ : state) {
    VectorXd dx = cg.solve(g);
    benchmark::DoNotOptimize(dx.data());
    last_iters = cg.iterations();
    last_error = cg.error();
  }

  state.counters["n"] = H.rows();
  state.counters["nnz"] = H.nonZeros();
  state.counters["iters"] = last_iters;
  state.counters["error"] = last_error;
}

// ============================================================================
// Register benchmarks
// ============================================================================

BENCHMARK(BM_BA_CPU_CG)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_BA_CPU_CG_Jacobi)->Unit(benchmark::kMillisecond);
BENCHMARK(BM_BA_GPU_CG)->Unit(benchmark::kMillisecond);

// ============================================================================
// Custom main: print summary after benchmarks
// ============================================================================

int main(int argc, char** argv) {
  benchmark::Initialize(&argc, argv);

  // Print problem info before benchmarks.
  const char* bal_file = std::getenv("BAL_FILE");
  if (bal_file) {
    ensure_loaded();
    fprintf(stderr,
            "\n"
            "=== Bundle Adjustment GPU CG Benchmark ===\n"
            "BAL file:      %s\n"
            "Cameras:       %d\n"
            "Points:        %d\n"
            "Observations:  %d\n"
            "J size:        %d x %d, nnz=%ld\n"
            "H size:        %d x %d, nnz=%ld\n"
            "|residual|:    %.6e\n"
            "==========================================\n\n",
            bal_file, g_problem.num_cameras, g_problem.num_points, g_problem.num_observations, g_neq.jacobian_rows,
            g_neq.jacobian_cols, g_neq.jacobian_nnz, (int)g_neq.H.rows(), (int)g_neq.H.cols(), (long)g_neq.H.nonZeros(),
            g_neq.residual_norm);
  }

  benchmark::RunSpecifiedBenchmarks();
  benchmark::Shutdown();
  return 0;
}