Add the operator interface needed for GPU iterative solvers:

- BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(), noalias(), operator+=/-=/*= dispatching to cuBLAS axpy/scal/dot/nrm2.
- DeviceScalar<Scalar>: device-resident scalar returned by reductions. Defers host sync until the value is read (implicit conversion). Device-side division via NPP for real types.
- GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(), cusparseHandle().
- GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm selection and plan caching.
- GpuSparseContext: GpuContext& constructor for same-stream execution, deviceView() returning DeviceSparseView with operator* for device-resident SpMV (d_y = d_A * d_x).
- geam expressions: d_C = d_A + alpha * d_B via cublasXgeam.
- GpuSVD::matrixV() convenience wrapper.

These additions make DeviceMatrix usable as a VectorType in Eigen algorithm templates. Conjugate gradient is the motivating example and is tested against CPU ConjugateGradient for correctness.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
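
A rough usage sketch of the interface described above (illustrative only, not part of the test file below; the host vectors x and y are hypothetical inputs and exact signatures may differ):

    GpuContext ctx;
    GpuContext::setThreadLocal(&ctx);
    // x, y: host-side Eigen vectors (hypothetical inputs).
    auto d_x = DeviceMatrix<double>::fromHost(x, ctx.stream());
    auto d_y = DeviceMatrix<double>::fromHost(y, ctx.stream());
    auto num = d_x.dot(ctx, d_y);          // DeviceScalar, stays on the device
    auto den = d_x.squaredNorm(ctx);       // DeviceScalar
    auto alpha = num / den;                // device-side division (NPP for real types)
    d_y += alpha * d_x;                    // axpy via cuBLAS, no host round-trip
    double nrm2 = d_y.squaredNorm(ctx);    // reading the value triggers the host sync
    GpuContext::setThreadLocal(nullptr);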
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// End-to-end test: CG algorithm running on GPU via DeviceMatrix.
//
// Uses DeviceSparseView for SpMV, DeviceMatrix for vectors, DeviceScalar
// for deferred reductions. Verifies correctness against CPU ConjugateGradient.

#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/IterativeLinearSolvers>
#include <Eigen/GPU>

using namespace Eigen;

// ---- Helper: build a sparse SPD matrix --------------------------------------

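// make_spd returns A = R^* R + n*I for a random sparse R: R^* R is positive
// semi-definite, and shifting every diagonal entry by n pushes all eigenvalues
// to at least n, so A is symmetric positive definite as CG requires.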
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  SpMat R(n, n);
  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
  for (Index j = 0; j < n; ++j) {
    for (Index i = 0; i < n; ++i) {
      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
        R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
      }
    }
  }
  R.makeCompressed();
  SpMat A = R.adjoint() * R;
  for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
  A.makeCompressed();
  return A;
}

// ---- GPU CG without preconditioner ------------------------------------------

template <typename Scalar>
void test_gpu_cg(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  SpMat A = make_spd<Scalar>(n);
  Vec b = Vec::Random(n);

  // CPU reference (identity preconditioner to match GPU).
  ConjugateGradient<SpMat, Lower | Upper, IdentityPreconditioner> cpu_cg;
  cpu_cg.setMaxIterations(1000);
  cpu_cg.setTolerance(RealScalar(1e-8));
  cpu_cg.compute(A);
  Vec x_cpu = cpu_cg.solve(b);
  VERIFY_IS_EQUAL(cpu_cg.info(), Success);

  // GPU CG: mirrors Eigen's conjugate_gradient() using DeviceMatrix ops.
  GpuContext ctx;
  GpuContext::setThreadLocal(&ctx);
  GpuSparseContext<Scalar> spmv_ctx(ctx);
  auto mat = spmv_ctx.deviceView(A);

  auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
  DeviceMatrix<Scalar> d_x(n, 1);
  d_x.setZero(ctx);

  // r = b (since x=0)
  DeviceMatrix<Scalar> residual(n, 1);
  residual.copyFrom(ctx, d_b);

  RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
  RealScalar tol = RealScalar(1e-8);
  RealScalar threshold = tol * tol * rhsNorm2;
  RealScalar residualNorm2 = residual.squaredNorm(ctx);

  // p = r (no preconditioner)
  DeviceMatrix<Scalar> p(n, 1);
  p.copyFrom(ctx, residual);
  DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);

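  // Standard CG recurrences (here z = r since there is no preconditioner):
  //   alpha = (r, z) / (p, A p)
  //   x += alpha * p;   r -= alpha * A p
  //   beta = (r_new, z_new) / (r_old, z_old)
  //   p = z + beta * p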
  auto absNew = residual.dot(ctx, p);
  Index maxIters = 1000;
  Index i = 0;
  while (i < maxIters) {
    tmp.noalias() = mat * p;

    auto alpha = absNew / p.dot(ctx, tmp);
    d_x += alpha * p;
    residual -= alpha * tmp;

    residualNorm2 = residual.squaredNorm(ctx);
    if (residualNorm2 < threshold) break;

    // z = r (no preconditioner)
    z.copyFrom(ctx, residual);

    auto absOld = std::move(absNew);
    absNew = residual.dot(ctx, z);
    auto beta = absNew / absOld;

    p *= Scalar(beta);
    p += z;
    i++;
  }

  GpuContext::setThreadLocal(nullptr);

  Vec x_gpu = d_x.toHost(ctx.stream());

  // Verify residual.
  Vec r = A * x_gpu - b;
  RealScalar relres = r.norm() / b.norm();
  VERIFY(relres < RealScalar(1e-6));

  // Compare with CPU.
  RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
}

// ---- GPU CG with Jacobi preconditioner --------------------------------------

template <typename Scalar>
void test_gpu_cg_jacobi(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  SpMat A = make_spd<Scalar>(n);
  Vec b = Vec::Random(n);

  // CPU reference.
  ConjugateGradient<SpMat, Lower | Upper> cpu_cg;
  cpu_cg.setMaxIterations(1000);
  cpu_cg.setTolerance(RealScalar(1e-8));
  cpu_cg.compute(A);
  Vec x_cpu = cpu_cg.solve(b);

  // Extract inverse diagonal.
  Vec invdiag(n);
  for (Index j = 0; j < A.outerSize(); ++j) {
    typename SpMat::InnerIterator it(A, j);
    while (it && it.index() != j) ++it;
    if (it && it.index() == j && it.value() != Scalar(0))
      invdiag(j) = Scalar(1) / it.value();
    else
      invdiag(j) = Scalar(1);
  }

  // GPU CG with Jacobi preconditioner.
  GpuContext ctx;
  GpuContext::setThreadLocal(&ctx);
  GpuSparseContext<Scalar> spmv_ctx(ctx);
  auto mat = spmv_ctx.deviceView(A);
  auto d_invdiag = DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());

  auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
  DeviceMatrix<Scalar> d_x(n, 1);
  d_x.setZero(ctx);

  DeviceMatrix<Scalar> residual(n, 1);
  residual.copyFrom(ctx, d_b);

  RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
  RealScalar tol = RealScalar(1e-8);
  RealScalar threshold = tol * tol * rhsNorm2;
  RealScalar residualNorm2 = residual.squaredNorm(ctx);

  // p = precond.solve(r) = invdiag .* r
  DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
  DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);

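  // Same recurrences as in test_gpu_cg, except the preconditioned residual
  // z = invdiag .* r feeds the (r, z) inner products for alpha and beta.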
  auto absNew = residual.dot(ctx, p);
  Index maxIters = 1000;
  Index i = 0;
  while (i < maxIters) {
    tmp.noalias() = mat * p;

    auto alpha = absNew / p.dot(ctx, tmp);
    d_x += alpha * p;
    residual -= alpha * tmp;

    residualNorm2 = residual.squaredNorm(ctx);
    if (residualNorm2 < threshold) break;

    // z = precond.solve(r) = invdiag .* r
    z.cwiseProduct(ctx, d_invdiag, residual);

    auto absOld = std::move(absNew);
    absNew = residual.dot(ctx, z);
    auto beta = absNew / absOld;

    p *= beta;
    p += z;
    i++;
  }

  GpuContext::setThreadLocal(nullptr);

  Vec x_gpu = d_x.toHost(ctx.stream());

  Vec r = A * x_gpu - b;
  RealScalar relres = r.norm() / b.norm();
  VERIFY(relres < RealScalar(1e-6));

  RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
}

EIGEN_DECLARE_TEST(gpu_cg) {
  CALL_SUBTEST(test_gpu_cg<double>(64));
  CALL_SUBTEST(test_gpu_cg<double>(256));
  CALL_SUBTEST(test_gpu_cg<float>(64));
  CALL_SUBTEST(test_gpu_cg_jacobi<double>(64));
  CALL_SUBTEST(test_gpu_cg_jacobi<double>(256));
  CALL_SUBTEST(test_gpu_cg_jacobi<float>(64));
}