// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2026 Rasmus Munk Larsen // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // End-to-end test: CG algorithm running on GPU via DeviceMatrix. // // Uses DeviceSparseView for SpMV, DeviceMatrix for vectors, DeviceScalar // for deferred reductions. Verifies correctness against CPU ConjugateGradient. #define EIGEN_USE_GPU #include "main.h" #include #include #include using namespace Eigen; // ---- Helper: build a sparse SPD matrix -------------------------------------- template SparseMatrix make_spd(Index n, double density = 0.1) { using SpMat = SparseMatrix; using RealScalar = typename NumTraits::Real; SpMat R(n, n); R.reserve(VectorXi::Constant(n, static_cast(n * density) + 1)); for (Index j = 0; j < n; ++j) { for (Index i = 0; i < n; ++i) { if (i == j || (std::rand() / double(RAND_MAX)) < density) { R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5); } } } R.makeCompressed(); SpMat A = R.adjoint() * R; for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n)); A.makeCompressed(); return A; } // ---- GPU CG without preconditioner ------------------------------------------ template void test_gpu_cg(Index n) { using SpMat = SparseMatrix; using Vec = Matrix; using RealScalar = typename NumTraits::Real; SpMat A = make_spd(n); Vec b = Vec::Random(n); // CPU reference (identity preconditioner to match GPU). ConjugateGradient cpu_cg; cpu_cg.setMaxIterations(1000); cpu_cg.setTolerance(RealScalar(1e-8)); cpu_cg.compute(A); Vec x_cpu = cpu_cg.solve(b); VERIFY_IS_EQUAL(cpu_cg.info(), Success); // GPU CG: mirrors Eigen's conjugate_gradient() using DeviceMatrix ops. GpuContext ctx; GpuContext::setThreadLocal(&ctx); GpuSparseContext spmv_ctx(ctx); auto mat = spmv_ctx.deviceView(A); auto d_b = DeviceMatrix::fromHost(b, ctx.stream()); DeviceMatrix d_x(n, 1); d_x.setZero(ctx); // r = b (since x=0) DeviceMatrix residual(n, 1); residual.copyFrom(ctx, d_b); RealScalar rhsNorm2 = d_b.squaredNorm(ctx); RealScalar tol = RealScalar(1e-8); RealScalar threshold = tol * tol * rhsNorm2; RealScalar residualNorm2 = residual.squaredNorm(ctx); // p = r (no preconditioner) DeviceMatrix p(n, 1); p.copyFrom(ctx, residual); DeviceMatrix z(n, 1), tmp(n, 1); auto absNew = residual.dot(ctx, p); Index maxIters = 1000; Index i = 0; while (i < maxIters) { tmp.noalias() = mat * p; auto alpha = absNew / p.dot(ctx, tmp); d_x += alpha * p; residual -= alpha * tmp; residualNorm2 = residual.squaredNorm(ctx); if (residualNorm2 < threshold) break; // z = r (no preconditioner) z.copyFrom(ctx, residual); auto absOld = std::move(absNew); absNew = residual.dot(ctx, z); auto beta = absNew / absOld; p *= Scalar(beta); p += z; i++; } GpuContext::setThreadLocal(nullptr); Vec x_gpu = d_x.toHost(ctx.stream()); // Verify residual. Vec r = A * x_gpu - b; RealScalar relres = r.norm() / b.norm(); VERIFY(relres < RealScalar(1e-6)); // Compare with CPU. RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits::epsilon(); VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol); } // ---- GPU CG with Jacobi preconditioner -------------------------------------- template void test_gpu_cg_jacobi(Index n) { using SpMat = SparseMatrix; using Vec = Matrix; using RealScalar = typename NumTraits::Real; SpMat A = make_spd(n); Vec b = Vec::Random(n); // CPU reference. ConjugateGradient cpu_cg; cpu_cg.setMaxIterations(1000); cpu_cg.setTolerance(RealScalar(1e-8)); cpu_cg.compute(A); Vec x_cpu = cpu_cg.solve(b); // Extract inverse diagonal. Vec invdiag(n); for (Index j = 0; j < A.outerSize(); ++j) { typename SpMat::InnerIterator it(A, j); while (it && it.index() != j) ++it; if (it && it.index() == j && it.value() != Scalar(0)) invdiag(j) = Scalar(1) / it.value(); else invdiag(j) = Scalar(1); } // GPU CG with Jacobi preconditioner. GpuContext ctx; GpuContext::setThreadLocal(&ctx); GpuSparseContext spmv_ctx(ctx); auto mat = spmv_ctx.deviceView(A); auto d_invdiag = DeviceMatrix::fromHost(invdiag, ctx.stream()); auto d_b = DeviceMatrix::fromHost(b, ctx.stream()); DeviceMatrix d_x(n, 1); d_x.setZero(ctx); DeviceMatrix residual(n, 1); residual.copyFrom(ctx, d_b); RealScalar rhsNorm2 = d_b.squaredNorm(ctx); RealScalar tol = RealScalar(1e-8); RealScalar threshold = tol * tol * rhsNorm2; RealScalar residualNorm2 = residual.squaredNorm(ctx); // p = precond.solve(r) = invdiag .* r DeviceMatrix p = d_invdiag.cwiseProduct(ctx, residual); DeviceMatrix z(n, 1), tmp(n, 1); auto absNew = residual.dot(ctx, p); Index maxIters = 1000; Index i = 0; while (i < maxIters) { tmp.noalias() = mat * p; auto alpha = absNew / p.dot(ctx, tmp); d_x += alpha * p; residual -= alpha * tmp; residualNorm2 = residual.squaredNorm(ctx); if (residualNorm2 < threshold) break; // z = precond.solve(r) = invdiag .* r z.cwiseProduct(ctx, d_invdiag, residual); auto absOld = std::move(absNew); absNew = residual.dot(ctx, z); auto beta = absNew / absOld; p *= beta; p += z; i++; } GpuContext::setThreadLocal(nullptr); Vec x_gpu = d_x.toHost(ctx.stream()); Vec r = A * x_gpu - b; RealScalar relres = r.norm() / b.norm(); VERIFY(relres < RealScalar(1e-6)); RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits::epsilon(); VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol); } EIGEN_DECLARE_TEST(gpu_cg) { CALL_SUBTEST(test_gpu_cg(64)); CALL_SUBTEST(test_gpu_cg(256)); CALL_SUBTEST(test_gpu_cg(64)); CALL_SUBTEST(test_gpu_cg_jacobi(64)); CALL_SUBTEST(test_gpu_cg_jacobi(256)); CALL_SUBTEST(test_gpu_cg_jacobi(64)); }