eigen/test/gpu_cg.cpp
Rasmus Munk Larsen 014f12f11a GPU: Add BLAS-1 ops, DeviceScalar, device-resident SpMV, and CG interop (5/5)
Add the operator interface needed for GPU iterative solvers:

- BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(),
  noalias(), operator+=/-=/*= dispatching to cuBLAS axpy/scal/dot/nrm2.
- DeviceScalar<Scalar>: device-resident scalar returned by reductions.
  Defers host sync until value is read (implicit conversion). Device-side
  division via NPP for real types.
- GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(),
  cusparseHandle().
- GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm
  selection and plan caching.
- GpuSparseContext: GpuContext& constructor for same-stream execution,
  deviceView() returning DeviceSparseView with operator* for device-resident
  SpMV (d_y = d_A * d_x).
- geam expressions: d_C = d_A + alpha * d_B via cublasXgeam.
- GpuSVD::matrixV() convenience wrapper.

These additions make DeviceMatrix usable as a VectorType in Eigen algorithm
templates. Conjugate gradient is the motivating example and is tested against
CPU ConjugateGradient for correctness.
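
A minimal sketch of how these pieces are meant to compose, assuming a host-side
sparse SPD matrix A and host vectors x and b already exist; it uses only the calls
exercised by the test below, so treat it as illustrative rather than normative:

    // Device-resident SpMV plus BLAS-1 reductions; the DeviceScalar returned by
    // squaredNorm()/dot() stays on the device until its value is read on the host.
    GpuContext ctx;
    GpuContext::setThreadLocal(&ctx);
    GpuSparseContext<double> sparse_ctx(ctx);      // shares ctx's stream
    auto d_A = sparse_ctx.deviceView(A);           // device-resident SpMV operator
    auto d_x = DeviceMatrix<double>::fromHost(x, ctx.stream());
    auto d_b = DeviceMatrix<double>::fromHost(b, ctx.stream());
    DeviceMatrix<double> d_r(b.rows(), 1);
    d_r.noalias() = d_A * d_x;                     // cuSPARSE SpMV, stays on device
    d_r -= d_b;                                    // cuBLAS axpy, no host sync
    double res2 = d_r.squaredNorm(ctx);            // host sync happens here
    GpuContext::setThreadLocal(nullptr);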

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 20:19:59 -07:00


// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// End-to-end test: CG algorithm running on GPU via DeviceMatrix.
//
// Uses DeviceSparseView for SpMV, DeviceMatrix for vectors, DeviceScalar
// for deferred reductions. Verifies correctness against CPU ConjugateGradient.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/GPU>
using namespace Eigen;
// ---- Helper: build a sparse SPD matrix --------------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat R(n, n);
  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
  for (Index j = 0; j < n; ++j) {
    for (Index i = 0; i < n; ++i) {
      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
        R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
      }
    }
  }
  R.makeCompressed();
  SpMat A = R.adjoint() * R;
  for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
  A.makeCompressed();
  return A;
}
// ---- GPU CG without preconditioner ------------------------------------------
template <typename Scalar>
void test_gpu_cg(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat A = make_spd<Scalar>(n);
  Vec b = Vec::Random(n);
  // CPU reference (identity preconditioner to match GPU).
  ConjugateGradient<SpMat, Lower | Upper, IdentityPreconditioner> cpu_cg;
  cpu_cg.setMaxIterations(1000);
  cpu_cg.setTolerance(RealScalar(1e-8));
  cpu_cg.compute(A);
  Vec x_cpu = cpu_cg.solve(b);
  VERIFY_IS_EQUAL(cpu_cg.info(), Success);
  // GPU CG: mirrors Eigen's conjugate_gradient() using DeviceMatrix ops.
  GpuContext ctx;
  GpuContext::setThreadLocal(&ctx);
  GpuSparseContext<Scalar> spmv_ctx(ctx);
  auto mat = spmv_ctx.deviceView(A);
  auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
  DeviceMatrix<Scalar> d_x(n, 1);
  d_x.setZero(ctx);
  // r = b (since x=0)
  DeviceMatrix<Scalar> residual(n, 1);
  residual.copyFrom(ctx, d_b);
  RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
  RealScalar tol = RealScalar(1e-8);
  RealScalar threshold = tol * tol * rhsNorm2;
  RealScalar residualNorm2 = residual.squaredNorm(ctx);
  // p = r (no preconditioner)
  DeviceMatrix<Scalar> p(n, 1);
  p.copyFrom(ctx, residual);
  DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
  auto absNew = residual.dot(ctx, p);
  Index maxIters = 1000;
  Index i = 0;
  while (i < maxIters) {
    tmp.noalias() = mat * p;
    auto alpha = absNew / p.dot(ctx, tmp);
    d_x += alpha * p;
    residual -= alpha * tmp;
    residualNorm2 = residual.squaredNorm(ctx);
    if (residualNorm2 < threshold) break;
    // z = r (no preconditioner)
    z.copyFrom(ctx, residual);
    auto absOld = std::move(absNew);
    absNew = residual.dot(ctx, z);
    auto beta = absNew / absOld;
    p *= Scalar(beta);
    p += z;
    i++;
  }
  GpuContext::setThreadLocal(nullptr);
  Vec x_gpu = d_x.toHost(ctx.stream());
  // Verify residual.
  Vec r = A * x_gpu - b;
  RealScalar relres = r.norm() / b.norm();
  VERIFY(relres < RealScalar(1e-6));
  // Compare with CPU.
  RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
}
// ---- GPU CG with Jacobi preconditioner --------------------------------------
template <typename Scalar>
void test_gpu_cg_jacobi(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat A = make_spd<Scalar>(n);
  Vec b = Vec::Random(n);
  // CPU reference.
  ConjugateGradient<SpMat, Lower | Upper> cpu_cg;
  cpu_cg.setMaxIterations(1000);
  cpu_cg.setTolerance(RealScalar(1e-8));
  cpu_cg.compute(A);
  Vec x_cpu = cpu_cg.solve(b);
  // Extract inverse diagonal.
  Vec invdiag(n);
  for (Index j = 0; j < A.outerSize(); ++j) {
    typename SpMat::InnerIterator it(A, j);
    while (it && it.index() != j) ++it;
    if (it && it.index() == j && it.value() != Scalar(0))
      invdiag(j) = Scalar(1) / it.value();
    else
      invdiag(j) = Scalar(1);
  }
  // GPU CG with Jacobi preconditioner.
  GpuContext ctx;
  GpuContext::setThreadLocal(&ctx);
  GpuSparseContext<Scalar> spmv_ctx(ctx);
  auto mat = spmv_ctx.deviceView(A);
  auto d_invdiag = DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());
  auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
  DeviceMatrix<Scalar> d_x(n, 1);
  d_x.setZero(ctx);
  DeviceMatrix<Scalar> residual(n, 1);
  residual.copyFrom(ctx, d_b);
  RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
  RealScalar tol = RealScalar(1e-8);
  RealScalar threshold = tol * tol * rhsNorm2;
  RealScalar residualNorm2 = residual.squaredNorm(ctx);
  // p = precond.solve(r) = invdiag .* r
  DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
  DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
  auto absNew = residual.dot(ctx, p);
  Index maxIters = 1000;
  Index i = 0;
  while (i < maxIters) {
    tmp.noalias() = mat * p;
    auto alpha = absNew / p.dot(ctx, tmp);
    d_x += alpha * p;
    residual -= alpha * tmp;
    residualNorm2 = residual.squaredNorm(ctx);
    if (residualNorm2 < threshold) break;
    // z = precond.solve(r) = invdiag .* r
    z.cwiseProduct(ctx, d_invdiag, residual);
    auto absOld = std::move(absNew);
    absNew = residual.dot(ctx, z);
    auto beta = absNew / absOld;
    p *= beta;
    p += z;
    i++;
  }
  GpuContext::setThreadLocal(nullptr);
  Vec x_gpu = d_x.toHost(ctx.stream());
  Vec r = A * x_gpu - b;
  RealScalar relres = r.norm() / b.norm();
  VERIFY(relres < RealScalar(1e-6));
  RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
}
EIGEN_DECLARE_TEST(gpu_cg) {
  CALL_SUBTEST(test_gpu_cg<double>(64));
  CALL_SUBTEST(test_gpu_cg<double>(256));
  CALL_SUBTEST(test_gpu_cg<float>(64));
  CALL_SUBTEST(test_gpu_cg_jacobi<double>(64));
  CALL_SUBTEST(test_gpu_cg_jacobi<double>(256));
  CALL_SUBTEST(test_gpu_cg_jacobi<float>(64));
}