Add the operator interface needed for GPU iterative solvers:

- BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(),
  noalias(), operator+=/-=/*= dispatching to cuBLAS axpy/scal/dot/nrm2.
- DeviceScalar<Scalar>: device-resident scalar returned by reductions.
  Defers host sync until the value is read (implicit conversion).
  Device-side division via NPP for real types.
- GpuContext: stream-borrowing constructor, setThreadLocal(),
  cublasLtHandle(), cusparseHandle().
- GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic
  algorithm selection and plan caching.
- GpuSparseContext: GpuContext& constructor for same-stream execution,
  deviceView() returning DeviceSparseView with operator* for
  device-resident SpMV (d_y = d_A * d_x).
- geam expressions: d_C = d_A + alpha * d_B via cublasXgeam.
- GpuSVD::matrixV() convenience wrapper.

These additions make DeviceMatrix usable as a VectorType in Eigen
algorithm templates. Conjugate gradient is the motivating example and is
tested against CPU ConjugateGradient for correctness.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
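
A minimal sketch of the intended call pattern (illustrative only; assumes
double Scalar, a host SparseMatrix A and a host vector b of size n):

    GpuContext ctx;
    GpuContext::setThreadLocal(&ctx);
    GpuSparseContext<Scalar> sparse(ctx);      // shares ctx's stream
    auto d_A = sparse.deviceView(A);           // DeviceSparseView over host CSC

    auto d_x = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
    DeviceMatrix<Scalar> d_y(n, 1);
    d_y.noalias() = d_A * d_x;                 // device-resident SpMV

    auto s = d_x.dot(d_y);                     // DeviceScalar: stays on device
    d_y += s * d_x;                            // device-pointer axpy, no host sync
    Scalar v = s;                              // implicit conversion syncs here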
// Benchmark: GPU Conjugate Gradient via DeviceMatrix operators.
//
// Shows the path to running Eigen's CG on GPU with minimal code changes.
// The DeviceMatrix benchmark mirrors Eigen's conjugate_gradient() line-by-line.
// A raw cuBLAS device-pointer-mode implementation is included as a lower bound.
//
// The only change needed in Eigen's CG template to support DeviceMatrix:
//   Line 34: typedef Dest VectorType;   (instead of Matrix<Scalar, Dynamic, 1>)
//
// Usage:
//   cmake --build build-bench-gpu --target bench_gpu_cg_sync
//   ./build-bench-gpu/bench_gpu_cg_sync
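//
// Sketched as a diff (paraphrasing the note above; upstream line numbers may drift):
//   --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
//   -  typedef Matrix<Scalar, Dynamic, 1> VectorType;
//   +  typedef Dest VectorType;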

#include <benchmark/benchmark.h>

#include <Eigen/Sparse>
#include <Eigen/GPU>
#include <cusparse.h>

using namespace Eigen;

using Scalar = double;
using RealScalar = double;
using Vec = Matrix<Scalar, Dynamic, 1>;
using SpMat = SparseMatrix<Scalar, ColMajor, int>;

// Tridiagonal SPD test matrix: 4 on the diagonal, -1 off-diagonal
// (strictly diagonally dominant, hence positive definite).
static SpMat make_spd(Index n) {
  SpMat A(n, n);
  A.reserve(VectorXi::Constant(n, 3));
  for (Index i = 0; i < n; ++i) {
    A.insert(i, i) = 4.0;
    if (i > 0) A.insert(i, i - 1) = -1.0;
    if (i < n - 1) A.insert(i, i + 1) = -1.0;
  }
  A.makeCompressed();
  return A;
}

// Touch the CUDA runtime once so lazy context creation happens outside timing.
static void cuda_warmup() {
  static bool done = false;
  if (!done) {
    void* p;
    cudaMalloc(&p, 1);
    cudaFree(p);
    done = true;
  }
}

// ==========================================================================
// GPU CG using DeviceMatrix operators — mirrors Eigen's conjugate_gradient()
// ==========================================================================
//
// Compare with Eigen/src/IterativeLinearSolvers/ConjugateGradient.h lines 29-84.
// Left column: Eigen CG code. Right column: this benchmark.
//
// Eigen CG                                    GPU CG (this benchmark)
// --------                                    -----------------------
// VectorType residual = rhs - mat * x;        residual.copyFrom(ctx, rhs);             [x=0 so r=b]
// RealScalar rhsNorm2 = rhs.squaredNorm();    RealScalar rhsNorm2 = rhs.squaredNorm(); [identical]
// ...
// tmp.noalias() = mat * p;                    tmp.noalias() = mat * p;                 [identical]
// Scalar alpha = absNew / p.dot(tmp);         Scalar alpha = absNew / p.dot(tmp);      [identical]
// x += alpha * p;                             x += alpha * p;                          [identical]
// residual -= alpha * tmp;                    residual -= alpha * tmp;                 [identical]
// residualNorm2 = residual.squaredNorm();     residualNorm2 = residual.squaredNorm();  [identical]
// ...
// p = z + beta * p;                           p *= beta; p += z;                       [equivalent, no alloc]

static void BM_CG_DeviceMatrixOps(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const Index maxIters = 200;

  SpMat A = make_spd(n);
  Vec b = Vec::Random(n);

  // One shared context: SpMV + BLAS-1 on same stream, zero event overhead.
  GpuContext ctx;
  GpuContext::setThreadLocal(&ctx);
  GpuSparseContext<Scalar> spmv(ctx);
  auto mat = spmv.deviceView(A);

  // Upload RHS once.
  auto rhs = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());

  for (auto _ : state) {
    // --- Eigen CG lines 34-63: initialization ---
    // typedef Dest VectorType;             // GPU CHANGE: was Matrix<Scalar,Dynamic,1>
    // VectorType residual = rhs - mat * x; // x=0, so residual = rhs
    DeviceMatrix<Scalar> x(n, 1);
    x.setZero();
    DeviceMatrix<Scalar> residual(n, 1);
    residual.copyFrom(ctx, rhs);

    // RealScalar rhsNorm2 = rhs.squaredNorm();
    RealScalar rhsNorm2 = rhs.squaredNorm();
    if (rhsNorm2 == 0) continue;

    RealScalar tol = 1e-10;
    const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
    RealScalar threshold = numext::maxi(RealScalar(tol * tol * rhsNorm2), considerAsZero);

    // RealScalar residualNorm2 = residual.squaredNorm();
    RealScalar residualNorm2 = residual.squaredNorm();
    if (residualNorm2 < threshold) continue;

    // VectorType p(n);
    // p = precond.solve(residual);  // no preconditioner: p = residual
    DeviceMatrix<Scalar> p(n, 1);
    p.copyFrom(ctx, residual);

    // VectorType z(n), tmp(n);
    DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);

    // auto absNew = numext::real(residual.dot(p));
    // DeviceScalar — stays on device, no sync.
    auto absNew = residual.dot(p);  // DeviceScalar, no sync

    // while (i < maxIters) {
    Index i = 0;
    while (i < maxIters) {
      // tmp.noalias() = mat * p;
      tmp.noalias() = mat * p;  // SpMV, device-resident

      // auto alpha = absNew / p.dot(tmp);
      // DeviceScalar / DeviceScalar → device kernel, no sync!
      auto alpha = absNew / p.dot(tmp);  // DeviceScalar, no sync

      // x += alpha * p;
      // DeviceScalar * DeviceMatrix → device-pointer axpy, no sync!
      x += alpha * p;

      // residual -= alpha * tmp;
      residual -= alpha * tmp;  // device-pointer axpy, no sync

      // residualNorm2 = residual.squaredNorm();
      residualNorm2 = residual.squaredNorm();  // THE one sync per iteration

      // if (residualNorm2 < threshold) break;
      if (residualNorm2 < threshold) break;

      // z = precond.solve(residual);
      z.copyFrom(ctx, residual);  // no preconditioner

      // auto absOld = std::move(absNew);
      auto absOld = std::move(absNew);  // no sync, no alloc

      // absNew = numext::real(residual.dot(z));
      absNew = residual.dot(z);  // DeviceScalar, no sync

      // auto beta = absNew / absOld;
      // DeviceScalar / DeviceScalar → device kernel, no sync!
      auto beta = absNew / absOld;  // DeviceScalar, no sync

      // p = z + beta * p;
      p *= beta;  // device-pointer scal, no host sync
      p += z;

      i++;
    }
  }

  GpuContext::setThreadLocal(nullptr);
  state.SetItemsProcessed(state.iterations() * maxIters);
}

BENCHMARK(BM_CG_DeviceMatrixOps)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);

// ==========================================================================
// Raw cuBLAS device-pointer-mode CG (1 sync/iter) — performance lower bound
// ==========================================================================

// Single-thread helper kernels: form quotients and negations of device-resident
// scalars so the cuBLAS calls below can consume them in device pointer mode.
__global__ void scalar_div_kernel(const Scalar* a, const Scalar* b, Scalar* out) { *out = *a / *b; }
__global__ void scalar_neg_kernel(const Scalar* in, Scalar* out) { *out = -(*in); }

static void BM_CG_DevicePointerMode(benchmark::State& state) {
  cuda_warmup();
  const Index n = state.range(0);
  const int maxIters = 200;

  SpMat A = make_spd(n);
  Vec b = Vec::Random(n);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cublasHandle_t cublas;
  cublasCreate(&cublas);
  cublasSetStream(cublas, stream);

  cusparseHandle_t cusparse;
  cusparseCreate(&cusparse);
  cusparseSetStream(cusparse, stream);

  // Upload A's CSC arrays to the device once.
  internal::DeviceBuffer d_outer((n + 1) * sizeof(int));
  internal::DeviceBuffer d_inner(A.nonZeros() * sizeof(int));
  internal::DeviceBuffer d_vals(A.nonZeros() * sizeof(Scalar));
  cudaMemcpy(d_outer.ptr, A.outerIndexPtr(), (n + 1) * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_inner.ptr, A.innerIndexPtr(), A.nonZeros() * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_vals.ptr, A.valuePtr(), A.nonZeros() * sizeof(Scalar), cudaMemcpyHostToDevice);

  cusparseSpMatDescr_t matA;
  cusparseCreateCsc(&matA, n, n, A.nonZeros(), d_outer.ptr, d_inner.ptr, d_vals.ptr, CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);

  // Size the SpMV workspace once using throwaway dense-vector descriptors.
  internal::DeviceBuffer d_tmp_buf(n * sizeof(Scalar));
  cusparseDnVecDescr_t tmp_x, tmp_y;
  cusparseCreateDnVec(&tmp_x, n, d_tmp_buf.ptr, CUDA_R_64F);
  cusparseCreateDnVec(&tmp_y, n, d_tmp_buf.ptr, CUDA_R_64F);
  Scalar spmv_alpha = 1.0, spmv_beta = 0.0;
  size_t ws_size = 0;
  cusparseSpMV_bufferSize(cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &spmv_alpha, matA, tmp_x, &spmv_beta, tmp_y,
                          CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &ws_size);
  internal::DeviceBuffer d_workspace(ws_size);
  cusparseDestroyDnVec(tmp_x);
  cusparseDestroyDnVec(tmp_y);

  // CG state vectors, plus device-resident scalars for CUBLAS_POINTER_MODE_DEVICE.
  internal::DeviceBuffer d_x(n * sizeof(Scalar)), d_r(n * sizeof(Scalar));
  internal::DeviceBuffer d_p(n * sizeof(Scalar)), d_tmp(n * sizeof(Scalar));
  internal::DeviceBuffer d_b(n * sizeof(Scalar));
  internal::DeviceBuffer d_absNew(sizeof(Scalar)), d_absOld(sizeof(Scalar));
  internal::DeviceBuffer d_pdot(sizeof(Scalar)), d_alpha(sizeof(Scalar));
  internal::DeviceBuffer d_neg_alpha(sizeof(Scalar)), d_beta(sizeof(Scalar));
  internal::DeviceBuffer d_rnorm(sizeof(RealScalar));

  cudaMemcpy(d_b.ptr, b.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice);

  auto spmv = [&](Scalar* x_ptr, Scalar* y_ptr) {
    cusparseDnVecDescr_t vx, vy;
    cusparseCreateDnVec(&vx, n, x_ptr, CUDA_R_64F);
    cusparseCreateDnVec(&vy, n, y_ptr, CUDA_R_64F);
    cusparseSpMV(cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &spmv_alpha, matA, vx, &spmv_beta, vy, CUDA_R_64F,
                 CUSPARSE_SPMV_ALG_DEFAULT, d_workspace.ptr);
    cusparseDestroyDnVec(vx);
    cusparseDestroyDnVec(vy);
  };
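
  // The lambda above rebuilds the dense-vector descriptors on every call; only
  // the SpMV workspace (d_workspace) is sized and allocated once. Descriptor
  // creation is a host-side operation and adds no stream synchronization.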

  for (auto _ : state) {
    // x = 0, r = b, p = b.
    cudaMemsetAsync(static_cast<Scalar*>(d_x.ptr), 0, n * sizeof(Scalar), stream);
    cudaMemcpyAsync(d_r.ptr, d_b.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
    cudaMemcpyAsync(d_p.ptr, d_b.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);

    // All scalar results stay on the device from here on.
    cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_DEVICE);
    // absNew = r . p
    cublasDdot(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_p.ptr), 1,
               static_cast<Scalar*>(d_absNew.ptr));

    for (int i = 0; i < maxIters; ++i) {
      // tmp = A * p
      spmv(static_cast<Scalar*>(d_p.ptr), static_cast<Scalar*>(d_tmp.ptr));

      // pdot = p . tmp
      cublasDdot(cublas, n, static_cast<Scalar*>(d_p.ptr), 1, static_cast<Scalar*>(d_tmp.ptr), 1,
                 static_cast<Scalar*>(d_pdot.ptr));

      // alpha = absNew / pdot on device; -alpha for the residual update.
      scalar_div_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_absNew.ptr), static_cast<Scalar*>(d_pdot.ptr),
                                             static_cast<Scalar*>(d_alpha.ptr));
      scalar_neg_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_alpha.ptr), static_cast<Scalar*>(d_neg_alpha.ptr));

      // x += alpha * p;  r -= alpha * tmp
      cublasDaxpy(cublas, n, static_cast<Scalar*>(d_alpha.ptr), static_cast<Scalar*>(d_p.ptr), 1,
                  static_cast<Scalar*>(d_x.ptr), 1);
      cublasDaxpy(cublas, n, static_cast<Scalar*>(d_neg_alpha.ptr), static_cast<Scalar*>(d_tmp.ptr), 1,
                  static_cast<Scalar*>(d_r.ptr), 1);

      cublasDnrm2(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<RealScalar*>(d_rnorm.ptr));

      // The one host sync per iteration: read ||r|| to test convergence.
      RealScalar rnorm;
      cudaMemcpyAsync(&rnorm, d_rnorm.ptr, sizeof(RealScalar), cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream);
      if (rnorm * rnorm < 1e-20) break;  // absolute tolerance, tol = 1e-10 squared

      // absOld = absNew; absNew = r . r; beta = absNew / absOld (on device).
      cudaMemcpyAsync(d_absOld.ptr, d_absNew.ptr, sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
      cublasDdot(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_r.ptr), 1,
                 static_cast<Scalar*>(d_absNew.ptr));

      scalar_div_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_absNew.ptr), static_cast<Scalar*>(d_absOld.ptr),
                                             static_cast<Scalar*>(d_beta.ptr));

      // p = r + beta * p, as scal followed by axpy with the host-side constant 1.0.
      cublasDscal(cublas, n, static_cast<Scalar*>(d_beta.ptr), static_cast<Scalar*>(d_p.ptr), 1);
      cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_HOST);
      Scalar one = 1.0;
      cublasDaxpy(cublas, n, &one, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_p.ptr), 1);
      cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_DEVICE);
    }
    cudaStreamSynchronize(stream);
  }

  state.SetItemsProcessed(state.iterations() * maxIters);
  cusparseDestroySpMat(matA);
  cusparseDestroy(cusparse);
  cublasDestroy(cublas);
  cudaStreamDestroy(stream);
}

BENCHMARK(BM_CG_DevicePointerMode)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
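
// Note: this assumes the build links a main() provider (e.g. benchmark_main);
// if built standalone, add BENCHMARK_MAIN(); here.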