eigen/test/gpu_cg.cpp
Rasmus Munk Larsen 014f12f11a GPU: Add BLAS-1 ops, DeviceScalar, device-resident SpMV, and CG interop (5/5)
Add the operator interface needed for GPU iterative solvers:

- BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(),
  noalias(), operator+=/-=/*= dispatching to cuBLAS axpy/scal/dot/nrm2.
- DeviceScalar<Scalar>: device-resident scalar returned by reductions.
  Defers host sync until value is read (implicit conversion). Device-side
  division via NPP for real types.
- GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(),
  cusparseHandle().
- GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm
  selection and plan caching.
- GpuSparseContext: GpuContext& constructor for same-stream execution,
  deviceView() returning DeviceSparseView with operator* for device-resident
  SpMV (d_y = d_A * d_x).
- geam expressions: d_C = d_A + alpha * d_B via cublasXgeam.
- GpuSVD::matrixV() convenience wrapper.

These additions make DeviceMatrix usable as a VectorType in Eigen algorithm
templates. Conjugate gradient is the motivating example and is tested against
CPU ConjugateGradient for correctness.
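
A minimal sketch of how these pieces are meant to compose, assuming a host-side
sparse SPD matrix A and host vectors x and b already exist; it uses only the calls
exercised by the test below, so treat it as illustrative rather than normative:

    // Device-resident SpMV plus BLAS-1 reductions; the DeviceScalar returned by
    // squaredNorm()/dot() stays on the device until its value is read on the host.
    GpuContext ctx;
    GpuContext::setThreadLocal(&ctx);
    GpuSparseContext<double> sparse_ctx(ctx);      // shares ctx's stream
    auto d_A = sparse_ctx.deviceView(A);           // device-resident SpMV operator
    auto d_x = DeviceMatrix<double>::fromHost(x, ctx.stream());
    auto d_b = DeviceMatrix<double>::fromHost(b, ctx.stream());
    DeviceMatrix<double> d_r(b.rows(), 1);
    d_r.noalias() = d_A * d_x;                     // cuSPARSE SpMV, stays on device
    d_r -= d_b;                                    // cuBLAS axpy, no host sync
    double res2 = d_r.squaredNorm(ctx);            // host sync happens here
    GpuContext::setThreadLocal(nullptr);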

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 20:19:59 -07:00


// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// End-to-end test: CG algorithm running on GPU via DeviceMatrix.
//
// Uses DeviceSparseView for SpMV, DeviceMatrix for vectors, DeviceScalar
// for deferred reductions. Verifies correctness against CPU ConjugateGradient.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a sparse SPD matrix --------------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat R(n, n);
  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
  for (Index j = 0; j < n; ++j) {
    for (Index i = 0; i < n; ++i) {
      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
        R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
      }
    }
  }
  R.makeCompressed();
  SpMat A = R.adjoint() * R;
  for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
  A.makeCompressed();
  return A;
}
// ---- GPU CG without preconditioner ------------------------------------------
template <typename Scalar>
void test_gpu_cg(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat A = make_spd<Scalar>(n);
  Vec b = Vec::Random(n);
  // CPU reference (identity preconditioner to match GPU).
  ConjugateGradient<SpMat, Lower | Upper, IdentityPreconditioner> cpu_cg;
  cpu_cg.setMaxIterations(1000);
  cpu_cg.setTolerance(RealScalar(1e-8));
  cpu_cg.compute(A);
  Vec x_cpu = cpu_cg.solve(b);
  VERIFY_IS_EQUAL(cpu_cg.info(), Success);
  // GPU CG: mirrors Eigen's conjugate_gradient() using DeviceMatrix ops.
  GpuContext ctx;
  GpuContext::setThreadLocal(&ctx);
  GpuSparseContext<Scalar> spmv_ctx(ctx);
  auto mat = spmv_ctx.deviceView(A);
  auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
  DeviceMatrix<Scalar> d_x(n, 1);
  d_x.setZero(ctx);
  // r = b (since x=0)
  DeviceMatrix<Scalar> residual(n, 1);
  residual.copyFrom(ctx, d_b);
  RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
  RealScalar tol = RealScalar(1e-8);
  RealScalar threshold = tol * tol * rhsNorm2;
  RealScalar residualNorm2 = residual.squaredNorm(ctx);
  // p = r (no preconditioner)
  DeviceMatrix<Scalar> p(n, 1);
  p.copyFrom(ctx, residual);
  DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
  auto absNew = residual.dot(ctx, p);
  Index maxIters = 1000;
  Index i = 0;
  while (i < maxIters) {
    tmp.noalias() = mat * p;
    auto alpha = absNew / p.dot(ctx, tmp);
    d_x += alpha * p;
    residual -= alpha * tmp;
    residualNorm2 = residual.squaredNorm(ctx);
    if (residualNorm2 < threshold) break;
    // z = r (no preconditioner)
    z.copyFrom(ctx, residual);
    auto absOld = std::move(absNew);
    absNew = residual.dot(ctx, z);
    auto beta = absNew / absOld;
    p *= Scalar(beta);
    p += z;
    i++;
  }
  GpuContext::setThreadLocal(nullptr);
  Vec x_gpu = d_x.toHost(ctx.stream());
  // Verify residual.
  Vec r = A * x_gpu - b;
  RealScalar relres = r.norm() / b.norm();
  VERIFY(relres < RealScalar(1e-6));
  // Compare with CPU.
  RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
}
// ---- GPU CG with Jacobi preconditioner --------------------------------------
template <typename Scalar>
void test_gpu_cg_jacobi(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat A = make_spd<Scalar>(n);
  Vec b = Vec::Random(n);
  // CPU reference.
  ConjugateGradient<SpMat, Lower | Upper> cpu_cg;
  cpu_cg.setMaxIterations(1000);
  cpu_cg.setTolerance(RealScalar(1e-8));
  cpu_cg.compute(A);
  Vec x_cpu = cpu_cg.solve(b);
  // Extract inverse diagonal.
  Vec invdiag(n);
  for (Index j = 0; j < A.outerSize(); ++j) {
    typename SpMat::InnerIterator it(A, j);
    while (it && it.index() != j) ++it;
    if (it && it.index() == j && it.value() != Scalar(0))
      invdiag(j) = Scalar(1) / it.value();
    else
      invdiag(j) = Scalar(1);
  }
  // GPU CG with Jacobi preconditioner.
  GpuContext ctx;
  GpuContext::setThreadLocal(&ctx);
  GpuSparseContext<Scalar> spmv_ctx(ctx);
  auto mat = spmv_ctx.deviceView(A);
  auto d_invdiag = DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());
  auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
  DeviceMatrix<Scalar> d_x(n, 1);
  d_x.setZero(ctx);
  DeviceMatrix<Scalar> residual(n, 1);
  residual.copyFrom(ctx, d_b);
  RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
  RealScalar tol = RealScalar(1e-8);
  RealScalar threshold = tol * tol * rhsNorm2;
  RealScalar residualNorm2 = residual.squaredNorm(ctx);
  // p = precond.solve(r) = invdiag .* r
  DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
  DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
  auto absNew = residual.dot(ctx, p);
  Index maxIters = 1000;
  Index i = 0;
  while (i < maxIters) {
    tmp.noalias() = mat * p;
    auto alpha = absNew / p.dot(ctx, tmp);
    d_x += alpha * p;
    residual -= alpha * tmp;
    residualNorm2 = residual.squaredNorm(ctx);
    if (residualNorm2 < threshold) break;
    // z = precond.solve(r) = invdiag .* r
    z.cwiseProduct(ctx, d_invdiag, residual);
    auto absOld = std::move(absNew);
    absNew = residual.dot(ctx, z);
    auto beta = absNew / absOld;
    p *= beta;
    p += z;
    i++;
  }
  GpuContext::setThreadLocal(nullptr);
  Vec x_gpu = d_x.toHost(ctx.stream());
  Vec r = A * x_gpu - b;
  RealScalar relres = r.norm() / b.norm();
  VERIFY(relres < RealScalar(1e-6));
  RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
}
EIGEN_DECLARE_TEST(gpu_cg) {
  CALL_SUBTEST(test_gpu_cg<double>(64));
  CALL_SUBTEST(test_gpu_cg<double>(256));
  CALL_SUBTEST(test_gpu_cg<float>(64));
  CALL_SUBTEST(test_gpu_cg_jacobi<double>(64));
  CALL_SUBTEST(test_gpu_cg_jacobi<double>(256));
  CALL_SUBTEST(test_gpu_cg_jacobi<float>(64));
}