GPU: Add BLAS-1 ops, DeviceScalar, device-resident SpMV, and CG interop (5/5)

Add the operator interface needed for GPU iterative solvers: - BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(), noalias(), operator+=/-=/\*= dispatching to cuBLAS axpy/scal/dot/nrm2. - DeviceScalar<Scalar>: device-resident scalar returned by reductions. Defers host sync until value is read (implicit conversion). Device-side division via NPP for real types. - GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(), cusparseHandle(). - GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm selection and plan caching. - GpuSparseContext: GpuContext& constructor for same-stream execution, deviceView() returning DeviceSparseView with operator* for device-resident SpMV (d_y = d_A * d_x). - geam expressions: d_C = d_A + alpha * d_B via cublasXgeam. - GpuSVD::matrixV() convenience wrapper. These additions make DeviceMatrix usable as a VectorType in Eigen algorithm templates. Conjugate gradient is the motivating example and is tested against CPU ConjugateGradient for correctness. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 11:34:33 +08:00 · 2026-04-09 19:54:13 -07:00
parent 43a95b62bb
commit 014f12f11a
22 changed files with 3363 additions and 151 deletions
--- a/test/gpu_cusparse_spmv.cpp
+++ b/test/gpu_cusparse_spmv.cpp
@@ -180,6 +180,105 @@ void test_empty() {
  VERIFY_IS_EQUAL(y.size(), 0);
 }

+// ---- DeviceMatrix SpMV (no host roundtrip) ----------------------------------
+
+template <typename Scalar>
+void test_spmv_device(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(n, n);
+  Vec x = Vec::Random(n);
+
+  // Use shared GpuContext for same-stream execution.
+  GpuContext gpu_ctx;
+  GpuSparseContext<Scalar> ctx(gpu_ctx);
+
+  auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
+  DeviceMatrix<Scalar> d_y;
+
+  ctx.multiply(A, d_x, d_y);
+
+  Vec y_gpu = d_y.toHost(gpu_ctx.stream());
+  Vec y_cpu = A * x;
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- Expression syntax: d_y = d_A * d_x ------------------------------------
+
+template <typename Scalar>
+void test_spmv_expr(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(n, n);
+  Vec x = Vec::Random(n);
+
+  GpuContext gpu_ctx;
+  GpuSparseContext<Scalar> ctx(gpu_ctx);
+
+  // Upload sparse matrix and create device view.
+  auto d_A = ctx.deviceView(A);
+
+  // Upload x.
+  auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
+
+  // Expression syntax: d_y = d_A * d_x
+  DeviceMatrix<Scalar> d_y;
+  d_y = d_A * d_x;
+
+  // Also test with noalias():
+  DeviceMatrix<Scalar> d_tmp;
+  d_tmp.noalias() = d_A * d_x;
+
+  Vec y_gpu = d_y.toHost(gpu_ctx.stream());
+  Vec tmp_gpu = d_tmp.toHost(gpu_ctx.stream());
+  Vec y_cpu = A * x;
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+  VERIFY((tmp_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- deviceView overwrite: second view replaces first -----------------------
+
+template <typename Scalar>
+void test_deviceview_overwrite(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A1 = make_sparse<Scalar>(n, n);
+  SpMat A2 = make_sparse<Scalar>(n, n);  // different random matrix
+
+  Vec x = Vec::Random(n);
+
+  GpuContext gpu_ctx;
+  GpuSparseContext<Scalar> ctx(gpu_ctx);
+
+  // First view: A1.
+  auto d_A1 = ctx.deviceView(A1);
+  auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
+  DeviceMatrix<Scalar> d_y1;
+  d_y1 = d_A1 * d_x;
+  Vec y1_gpu = d_y1.toHost(gpu_ctx.stream());
+  Vec y1_cpu = A1 * x;
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((y1_gpu - y1_cpu).norm() / (y1_cpu.norm() + RealScalar(1)) < tol);
+
+  // Second view overwrites first: now uses A2.
+  auto d_A2 = ctx.deviceView(A2);
+  DeviceMatrix<Scalar> d_y2;
+  d_y2 = d_A2 * d_x;
+  Vec y2_gpu = d_y2.toHost(gpu_ctx.stream());
+  Vec y2_cpu = A2 * x;
+  VERIFY((y2_gpu - y2_cpu).norm() / (y2_cpu.norm() + RealScalar(1)) < tol);
+}
+
 // ---- Per-scalar driver ------------------------------------------------------

 template <typename Scalar>
@@ -193,6 +292,9 @@ void test_scalar() {
  CALL_SUBTEST(test_identity<Scalar>(64));
  CALL_SUBTEST(test_reuse<Scalar>(64));
  CALL_SUBTEST(test_empty<Scalar>());
+  CALL_SUBTEST(test_spmv_device<Scalar>(64));
+  CALL_SUBTEST(test_spmv_expr<Scalar>(64));
+  CALL_SUBTEST(test_deviceview_overwrite<Scalar>(64));
 }

 EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {