test/gpu_cusparse_spmv.cpp

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Tests for GpuSparseContext: GPU SpMV/SpMM via cuSPARSE.

#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <Eigen/GPU>

using namespace Eigen;

// ---- Helper: build a random sparse matrix -----------------------------------

// Draws one matrix entry for a real scalar type, uniformly from [-0.5, 0.5].
// The pointer argument is never dereferenced; it only selects the overload.
template <typename Real>
Real random_sparse_entry(const Real*) {
  return Real(std::rand() / double(RAND_MAX) - 0.5);
}

// Draws one matrix entry for a complex scalar type. Real and imaginary parts
// are sampled independently from [-0.5, 0.5], so complex instantiations of the
// tests exercise genuinely complex matrices (a matrix with zero imaginary part
// cannot distinguish a transposed product from an adjoint one).
template <typename Real>
std::complex<Real> random_sparse_entry(const std::complex<Real>*) {
  // Two separate statements pin the order of the std::rand() calls; as
  // constructor arguments their evaluation order would be unspecified.
  const Real re = Real(std::rand() / double(RAND_MAX) - 0.5);
  const Real im = Real(std::rand() / double(RAND_MAX) - 0.5);
  return std::complex<Real>(re, im);
}

// Builds a compressed, column-major rows x cols sparse matrix for testing.
//
// Every position (i, j) is kept with probability `density` (decided by one
// std::rand() draw) and, if kept, filled by random_sparse_entry(). For real
// scalar types the generated values and the consumed random sequence are the
// same as with a plain `rand()/RAND_MAX - 0.5` draw per stored entry.
//
// rows, cols  Matrix dimensions.
// density     Probability in [0, 1] that a given entry is stored.
// Returns the matrix after makeCompressed().
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_sparse(Index rows, Index cols, double density = 0.1) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;

  SpMat R(rows, cols);
  // Per-column reservation of about the expected number of stored entries.
  R.reserve(VectorXi::Constant(cols, static_cast<int>(rows * density) + 1));
  for (Index j = 0; j < cols; ++j) {
    for (Index i = 0; i < rows; ++i) {
      if ((std::rand() / double(RAND_MAX)) < density) {
        R.insert(i, j) = random_sparse_entry(static_cast<const Scalar*>(nullptr));
      }
    }
  }
  R.makeCompressed();
  return R;
}

// ---- SpMV: y = A * x -------------------------------------------------------

// Checks GpuSparseContext::multiply(A, x) against the host product A * x for
// a random rows x cols sparse matrix and a random vector of length cols.
template <typename Scalar>
void test_spmv(Index rows, Index cols) {
  using SparseMat = SparseMatrix<Scalar, ColMajor, int>;
  using DenseVec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  // Error budget grows with the larger matrix dimension.
  const RealScalar largest_dim = RealScalar((std::max)(rows, cols));
  const RealScalar tol = RealScalar(10) * largest_dim * NumTraits<Scalar>::epsilon();

  // The matrix is generated before the vector, so random numbers are drawn in
  // a fixed order.
  SparseMat A = make_sparse<Scalar>(rows, cols);
  DenseVec x = DenseVec::Random(cols);

  GpuSparseContext<Scalar> ctx;
  const DenseVec y_gpu = ctx.multiply(A, x);
  const DenseVec y_cpu = A * x;

  // The result must have one entry per matrix row and match the host product
  // up to a relative error (the +1 guards against a near-zero reference norm).
  VERIFY_IS_EQUAL(y_gpu.size(), rows);
  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}

// ---- SpMV with alpha/beta: y = alpha*A*x + beta*y ---------------------------

// Checks the scaled/accumulating form y = alpha*A*x + beta*y on a random
// n x n sparse matrix, with alpha = 2 and beta = 3.
template <typename Scalar>
void test_spmv_alpha_beta(Index n) {
  using SparseMat = SparseMatrix<Scalar, ColMajor, int>;
  using DenseVec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  const RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();

  // Inputs, drawn in a fixed order: matrix, x, then the initial y.
  SparseMat A = make_sparse<Scalar>(n, n);
  DenseVec x = DenseVec::Random(n);
  DenseVec y_init = DenseVec::Random(n);

  Scalar alpha = Scalar(2);
  Scalar beta = Scalar(3);

  // Host reference, built from the untouched initial vector.
  const DenseVec y_cpu = alpha * (A * x) + beta * y_init;

  // y_gpu starts as a copy of y_init and is passed as the output argument;
  // its contents after the call are what gets compared.
  GpuSparseContext<Scalar> ctx;
  DenseVec y_gpu = y_init;
  ctx.multiply(A, x, y_gpu, alpha, beta);

  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}

// ---- Transpose: y = A^T * x ------------------------------------------------

// Checks GpuSparseContext::multiplyT(A, x) against the host product
// A.transpose() * x for a random rows x cols matrix and a vector of length rows.
template <typename Scalar>
void test_spmv_transpose(Index rows, Index cols) {
  using SparseMat = SparseMatrix<Scalar, ColMajor, int>;
  using DenseVec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  // Error budget grows with the larger matrix dimension.
  const RealScalar largest_dim = RealScalar((std::max)(rows, cols));
  const RealScalar tol = RealScalar(10) * largest_dim * NumTraits<Scalar>::epsilon();

  // Note that x has `rows` entries here: it multiplies the transposed matrix.
  SparseMat A = make_sparse<Scalar>(rows, cols);
  DenseVec x = DenseVec::Random(rows);

  GpuSparseContext<Scalar> ctx;
  const DenseVec y_gpu = ctx.multiplyT(A, x);
  const DenseVec y_cpu = A.transpose() * x;

  // The transposed product has one entry per matrix column.
  VERIFY_IS_EQUAL(y_gpu.size(), cols);
  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}

// ---- SpMM: Y = A * X (multiple RHS) ----------------------------------------

// Checks GpuSparseContext::multiplyMat(A, X) against the host product A * X,
// where X is a random dense cols x nrhs matrix (several right-hand sides).
template <typename Scalar>
void test_spmm(Index rows, Index cols, Index nrhs) {
  using SparseMat = SparseMatrix<Scalar, ColMajor, int>;
  using DenseMat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  // Error budget grows with the larger dimension of the sparse operand.
  const RealScalar largest_dim = RealScalar((std::max)(rows, cols));
  const RealScalar tol = RealScalar(10) * largest_dim * NumTraits<Scalar>::epsilon();

  SparseMat A = make_sparse<Scalar>(rows, cols);
  DenseMat X = DenseMat::Random(cols, nrhs);

  GpuSparseContext<Scalar> ctx;
  const DenseMat Y_gpu = ctx.multiplyMat(A, X);
  const DenseMat Y_cpu = A * X;

  // Shape first, then the relative error over the whole result matrix.
  VERIFY_IS_EQUAL(Y_gpu.rows(), rows);
  VERIFY_IS_EQUAL(Y_gpu.cols(), nrhs);
  VERIFY((Y_gpu - Y_cpu).norm() / (Y_cpu.norm() + RealScalar(1)) < tol);
}

// ---- Identity matrix: I * x = x --------------------------------------------

// Multiplies a random vector by the n x n sparse identity and requires the
// result to reproduce the input to within machine epsilon (absolute norm).
template <typename Scalar>
void test_identity(Index n) {
  using SparseMat = SparseMatrix<Scalar, ColMajor, int>;
  using DenseVec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  // Deliberately tight: no size-dependent slack is granted for I * x.
  const RealScalar tol = NumTraits<Scalar>::epsilon();

  // Sparse identity in compressed storage.
  SparseMat eye(n, n);
  eye.setIdentity();
  eye.makeCompressed();

  DenseVec x = DenseVec::Random(n);

  GpuSparseContext<Scalar> ctx;
  const DenseVec y = ctx.multiply(eye, x);

  VERIFY((y - x).norm() < tol);
}

// ---- Context reuse ----------------------------------------------------------

// Uses one GpuSparseContext for three consecutive products, each with a
// freshly generated n x n matrix and vector, and checks every result against
// the host product.
template <typename Scalar>
void test_reuse(Index n) {
  using SparseMat = SparseMatrix<Scalar, ColMajor, int>;
  using DenseVec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  // The context is created once, outside the loop: reuse is the point here.
  GpuSparseContext<Scalar> ctx;
  const RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();

  int trials_left = 3;
  while (trials_left-- > 0) {
    SparseMat A = make_sparse<Scalar>(n, n);
    DenseVec x = DenseVec::Random(n);
    const DenseVec y_gpu = ctx.multiply(A, x);
    const DenseVec y_cpu = A * x;
    VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
  }
}

// ---- Empty ------------------------------------------------------------------

template <typename Scalar>
void test_empty() {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;

  SpMat A(0, 0);
  A.makeCompressed();
  Vec x(0);

  GpuSparseContext<Scalar> ctx;
  Vec y = ctx.multiply(A, x);
  VERIFY_IS_EQUAL(y.size(), 0);
}

// ---- DeviceMatrix SpMV (no host roundtrip) ----------------------------------

// Checks the multiply overload that takes DeviceMatrix input and output:
// x is uploaded with fromHost, the product is written to a DeviceMatrix, and
// the result is brought back with toHost before being compared with A * x.
template <typename Scalar>
void test_spmv_device(Index n) {
  using SparseMat = SparseMatrix<Scalar, ColMajor, int>;
  using DenseVec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  const RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();

  SparseMat A = make_sparse<Scalar>(n, n);
  DenseVec x = DenseVec::Random(n);

  // The sparse context is built on a shared GpuContext, whose stream is also
  // used for the upload and download below.
  GpuContext gpu_ctx;
  GpuSparseContext<Scalar> ctx(gpu_ctx);

  auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
  DeviceMatrix<Scalar> d_y;  // default-constructed; filled by multiply()

  ctx.multiply(A, d_x, d_y);

  const DenseVec y_gpu = d_y.toHost(gpu_ctx.stream());
  const DenseVec y_cpu = A * x;

  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}

// ---- Expression syntax: d_y = d_A * d_x ------------------------------------

// Checks the operator form of the device product: a view obtained from
// deviceView(A) is multiplied by a DeviceMatrix, once through plain assignment
// and once through noalias(). Both results are compared with the host A * x.
template <typename Scalar>
void test_spmv_expr(Index n) {
  using SparseMat = SparseMatrix<Scalar, ColMajor, int>;
  using DenseVec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  const RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();

  SparseMat A = make_sparse<Scalar>(n, n);
  DenseVec x = DenseVec::Random(n);

  GpuContext gpu_ctx;
  GpuSparseContext<Scalar> ctx(gpu_ctx);

  // Device-side operands: a view of the sparse matrix and an uploaded x.
  auto d_A = ctx.deviceView(A);
  auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());

  // Variant 1: plain assignment from the product expression.
  DeviceMatrix<Scalar> d_y;
  d_y = d_A * d_x;

  // Variant 2: the same product assigned through noalias().
  DeviceMatrix<Scalar> d_tmp;
  d_tmp.noalias() = d_A * d_x;

  // Download both results, then form the host reference.
  const DenseVec y_gpu = d_y.toHost(gpu_ctx.stream());
  const DenseVec tmp_gpu = d_tmp.toHost(gpu_ctx.stream());
  const DenseVec y_cpu = A * x;

  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
  VERIFY((tmp_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
}

// ---- deviceView overwrite: second view replaces first -----------------------

// Requests two device views in a row from the same GpuSparseContext, for two
// different random matrices, and checks that the product taken right after
// each request matches the corresponding host product.
template <typename Scalar>
void test_deviceview_overwrite(Index n) {
  using SparseMat = SparseMatrix<Scalar, ColMajor, int>;
  using DenseVec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;

  // One tolerance serves both comparisons.
  const RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();

  // Two independently drawn matrices, then the shared input vector.
  SparseMat A1 = make_sparse<Scalar>(n, n);
  SparseMat A2 = make_sparse<Scalar>(n, n);

  DenseVec x = DenseVec::Random(n);

  GpuContext gpu_ctx;
  GpuSparseContext<Scalar> ctx(gpu_ctx);

  // Round 1: view of A1.
  auto d_A1 = ctx.deviceView(A1);
  auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
  DeviceMatrix<Scalar> d_y1;
  d_y1 = d_A1 * d_x;
  const DenseVec y1_gpu = d_y1.toHost(gpu_ctx.stream());
  const DenseVec y1_cpu = A1 * x;
  VERIFY((y1_gpu - y1_cpu).norm() / (y1_cpu.norm() + RealScalar(1)) < tol);

  // Round 2: a view of A2 is requested from the same context while d_A1 is
  // still in scope; the product through the new view must reflect A2.
  auto d_A2 = ctx.deviceView(A2);
  DeviceMatrix<Scalar> d_y2;
  d_y2 = d_A2 * d_x;
  const DenseVec y2_gpu = d_y2.toHost(gpu_ctx.stream());
  const DenseVec y2_cpu = A2 * x;
  VERIFY((y2_gpu - y2_cpu).norm() / (y2_cpu.norm() + RealScalar(1)) < tol);
}

// ---- Per-scalar driver ------------------------------------------------------

// Runs every sub-test defined in this file for a single scalar type.
// All problem sizes are small fixed constants: 64 or 128 per dimension, and
// 4 right-hand sides for the SpMM case.
template <typename Scalar>
void test_scalar() {
  // Host-vector SpMV in square, tall and wide shapes.
  CALL_SUBTEST(test_spmv<Scalar>(64, 64));
  CALL_SUBTEST(test_spmv<Scalar>(128, 64));  // tall: more rows than columns
  CALL_SUBTEST(test_spmv<Scalar>(64, 128));  // wide: more columns than rows
  CALL_SUBTEST(test_spmv_alpha_beta<Scalar>(64));
  // The transposed product is exercised on a tall (128 x 64) matrix only.
  CALL_SUBTEST(test_spmv_transpose<Scalar>(128, 64));
  CALL_SUBTEST(test_spmm<Scalar>(64, 64, 4));
  CALL_SUBTEST(test_identity<Scalar>(64));
  CALL_SUBTEST(test_reuse<Scalar>(64));
  CALL_SUBTEST(test_empty<Scalar>());
  // Variants that keep vectors in DeviceMatrix objects.
  CALL_SUBTEST(test_spmv_device<Scalar>(64));
  CALL_SUBTEST(test_spmv_expr<Scalar>(64));
  CALL_SUBTEST(test_deviceview_overwrite<Scalar>(64));
}

// Test entry point: runs the per-scalar suite for single and double precision,
// each in a real and a complex flavor.
EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {
  CALL_SUBTEST(test_scalar<float>());
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
}
GPU: Add sparse solvers, FFT, and SpMV (cuDSS, cuFFT, cuSPARSE) Add GPU sparse direct solvers (Cholesky, LDL^T, LU) via cuDSS, 1D/2D FFT via cuFFT with plan caching, and sparse matrix-vector/matrix multiply (SpMV/SpMM) via cuSPARSE. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-09 19:11:49 -07:00			`// This file is part of Eigen, a lightweight C++ template library`
			`// for linear algebra.`
			`//`
			`// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>`
			`//`
			`// This Source Code Form is subject to the terms of the Mozilla`
			`// Public License v. 2.0. If a copy of the MPL was not distributed`
			`// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.`

			`// Tests for GpuSparseContext: GPU SpMV/SpMM via cuSPARSE.`

			`#define EIGEN_USE_GPU`
			`#include "main.h"`
			`#include <Eigen/Sparse>`
			`#include <Eigen/GPU>`

			`using namespace Eigen;`

			`// ---- Helper: build a random sparse matrix -----------------------------------`

			`template <typename Scalar>`
			`SparseMatrix<Scalar, ColMajor, int> make_sparse(Index rows, Index cols, double density = 0.1) {`
			`using SpMat = SparseMatrix<Scalar, ColMajor, int>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`SpMat R(rows, cols);`
			`R.reserve(VectorXi::Constant(cols, static_cast<int>(rows * density) + 1));`
			`for (Index j = 0; j < cols; ++j) {`
			`for (Index i = 0; i < rows; ++i) {`
			`if ((std::rand() / double(RAND_MAX)) < density) {`
			`R.insert(i, j) = Scalar(RealScalar(std::rand() / double(RAND_MAX) - 0.5));`
			`}`
			`}`
			`}`
			`R.makeCompressed();`
			`return R;`
			`}`

			`// ---- SpMV: y = A * x -------------------------------------------------------`

			`template <typename Scalar>`
			`void test_spmv(Index rows, Index cols) {`
			`using SpMat = SparseMatrix<Scalar, ColMajor, int>;`
			`using Vec = Matrix<Scalar, Dynamic, 1>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`SpMat A = make_sparse<Scalar>(rows, cols);`
			`Vec x = Vec::Random(cols);`

			`GpuSparseContext<Scalar> ctx;`
			`Vec y_gpu = ctx.multiply(A, x);`
			`Vec y_cpu = A * x;`

			`RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();`
			`VERIFY_IS_EQUAL(y_gpu.size(), rows);`
			`VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);`
			`}`

			`// ---- SpMV with alpha/beta: y = alphaAx + beta*y ---------------------------`

			`template <typename Scalar>`
			`void test_spmv_alpha_beta(Index n) {`
			`using SpMat = SparseMatrix<Scalar, ColMajor, int>;`
			`using Vec = Matrix<Scalar, Dynamic, 1>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`SpMat A = make_sparse<Scalar>(n, n);`
			`Vec x = Vec::Random(n);`
			`Vec y_init = Vec::Random(n);`

			`Scalar alpha(2);`
			`Scalar beta(3);`

			`Vec y_cpu = alpha * (A * x) + beta * y_init;`

			`GpuSparseContext<Scalar> ctx;`
			`Vec y_gpu = y_init;`
			`ctx.multiply(A, x, y_gpu, alpha, beta);`

			`RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();`
			`VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);`
			`}`

			`// ---- Transpose: y = A^T * x ------------------------------------------------`

			`template <typename Scalar>`
			`void test_spmv_transpose(Index rows, Index cols) {`
			`using SpMat = SparseMatrix<Scalar, ColMajor, int>;`
			`using Vec = Matrix<Scalar, Dynamic, 1>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`SpMat A = make_sparse<Scalar>(rows, cols);`
			`Vec x = Vec::Random(rows);`

			`GpuSparseContext<Scalar> ctx;`
			`Vec y_gpu = ctx.multiplyT(A, x);`
			`Vec y_cpu = A.transpose() * x;`

			`RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();`
			`VERIFY_IS_EQUAL(y_gpu.size(), cols);`
			`VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);`
			`}`

			`// ---- SpMM: Y = A * X (multiple RHS) ----------------------------------------`

			`template <typename Scalar>`
			`void test_spmm(Index rows, Index cols, Index nrhs) {`
			`using SpMat = SparseMatrix<Scalar, ColMajor, int>;`
			`using Mat = Matrix<Scalar, Dynamic, Dynamic>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`SpMat A = make_sparse<Scalar>(rows, cols);`
			`Mat X = Mat::Random(cols, nrhs);`

			`GpuSparseContext<Scalar> ctx;`
			`Mat Y_gpu = ctx.multiplyMat(A, X);`
			`Mat Y_cpu = A * X;`

			`RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();`
			`VERIFY_IS_EQUAL(Y_gpu.rows(), rows);`
			`VERIFY_IS_EQUAL(Y_gpu.cols(), nrhs);`
			`VERIFY((Y_gpu - Y_cpu).norm() / (Y_cpu.norm() + RealScalar(1)) < tol);`
			`}`

			`// ---- Identity matrix: I * x = x --------------------------------------------`

			`template <typename Scalar>`
			`void test_identity(Index n) {`
			`using SpMat = SparseMatrix<Scalar, ColMajor, int>;`
			`using Vec = Matrix<Scalar, Dynamic, 1>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`// Build sparse identity.`
			`SpMat eye(n, n);`
			`eye.setIdentity();`
			`eye.makeCompressed();`

			`Vec x = Vec::Random(n);`

			`GpuSparseContext<Scalar> ctx;`
			`Vec y = ctx.multiply(eye, x);`

			`RealScalar tol = NumTraits<Scalar>::epsilon();`
			`VERIFY((y - x).norm() < tol);`
			`}`

			`// ---- Context reuse ----------------------------------------------------------`

			`template <typename Scalar>`
			`void test_reuse(Index n) {`
			`using SpMat = SparseMatrix<Scalar, ColMajor, int>;`
			`using Vec = Matrix<Scalar, Dynamic, 1>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`GpuSparseContext<Scalar> ctx;`
			`RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();`

			`for (int trial = 0; trial < 3; ++trial) {`
			`SpMat A = make_sparse<Scalar>(n, n);`
			`Vec x = Vec::Random(n);`
			`Vec y_gpu = ctx.multiply(A, x);`
			`Vec y_cpu = A * x;`
			`VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);`
			`}`
			`}`

			`// ---- Empty ------------------------------------------------------------------`

			`template <typename Scalar>`
			`void test_empty() {`
			`using SpMat = SparseMatrix<Scalar, ColMajor, int>;`
			`using Vec = Matrix<Scalar, Dynamic, 1>;`

			`SpMat A(0, 0);`
			`A.makeCompressed();`
			`Vec x(0);`

			`GpuSparseContext<Scalar> ctx;`
			`Vec y = ctx.multiply(A, x);`
			`VERIFY_IS_EQUAL(y.size(), 0);`
			`}`

GPU: Add BLAS-1 ops, DeviceScalar, device-resident SpMV, and CG interop (5/5) Add the operator interface needed for GPU iterative solvers: - BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(), noalias(), operator+=/-=/\= dispatching to cuBLAS axpy/scal/dot/nrm2. - DeviceScalar<Scalar>: device-resident scalar returned by reductions. Defers host sync until value is read (implicit conversion). Device-side division via NPP for real types. - GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(), cusparseHandle(). - GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm selection and plan caching. - GpuSparseContext: GpuContext& constructor for same-stream execution, deviceView() returning DeviceSparseView with operator for device-resident SpMV (d_y = d_A * d_x). - geam expressions: d_C = d_A + alpha * d_B via cublasXgeam. - GpuSVD::matrixV() convenience wrapper. These additions make DeviceMatrix usable as a VectorType in Eigen algorithm templates. Conjugate gradient is the motivating example and is tested against CPU ConjugateGradient for correctness. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-09 19:54:13 -07:00			`// ---- DeviceMatrix SpMV (no host roundtrip) ----------------------------------`

			`template <typename Scalar>`
			`void test_spmv_device(Index n) {`
			`using SpMat = SparseMatrix<Scalar, ColMajor, int>;`
			`using Vec = Matrix<Scalar, Dynamic, 1>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`SpMat A = make_sparse<Scalar>(n, n);`
			`Vec x = Vec::Random(n);`

			`// Use shared GpuContext for same-stream execution.`
			`GpuContext gpu_ctx;`
			`GpuSparseContext<Scalar> ctx(gpu_ctx);`

			`auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());`
			`DeviceMatrix<Scalar> d_y;`

			`ctx.multiply(A, d_x, d_y);`

			`Vec y_gpu = d_y.toHost(gpu_ctx.stream());`
			`Vec y_cpu = A * x;`

			`RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();`
			`VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);`
			`}`

			`// ---- Expression syntax: d_y = d_A * d_x ------------------------------------`

			`template <typename Scalar>`
			`void test_spmv_expr(Index n) {`
			`using SpMat = SparseMatrix<Scalar, ColMajor, int>;`
			`using Vec = Matrix<Scalar, Dynamic, 1>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`SpMat A = make_sparse<Scalar>(n, n);`
			`Vec x = Vec::Random(n);`

			`GpuContext gpu_ctx;`
			`GpuSparseContext<Scalar> ctx(gpu_ctx);`

			`// Upload sparse matrix and create device view.`
			`auto d_A = ctx.deviceView(A);`

			`// Upload x.`
			`auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());`

			`// Expression syntax: d_y = d_A * d_x`
			`DeviceMatrix<Scalar> d_y;`
			`d_y = d_A * d_x;`

			`// Also test with noalias():`
			`DeviceMatrix<Scalar> d_tmp;`
			`d_tmp.noalias() = d_A * d_x;`

			`Vec y_gpu = d_y.toHost(gpu_ctx.stream());`
			`Vec tmp_gpu = d_tmp.toHost(gpu_ctx.stream());`
			`Vec y_cpu = A * x;`

			`RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();`
			`VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);`
			`VERIFY((tmp_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);`
			`}`

			`// ---- deviceView overwrite: second view replaces first -----------------------`

			`template <typename Scalar>`
			`void test_deviceview_overwrite(Index n) {`
			`using SpMat = SparseMatrix<Scalar, ColMajor, int>;`
			`using Vec = Matrix<Scalar, Dynamic, 1>;`
			`using RealScalar = typename NumTraits<Scalar>::Real;`

			`SpMat A1 = make_sparse<Scalar>(n, n);`
			`SpMat A2 = make_sparse<Scalar>(n, n); // different random matrix`

			`Vec x = Vec::Random(n);`

			`GpuContext gpu_ctx;`
			`GpuSparseContext<Scalar> ctx(gpu_ctx);`

			`// First view: A1.`
			`auto d_A1 = ctx.deviceView(A1);`
			`auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());`
			`DeviceMatrix<Scalar> d_y1;`
			`d_y1 = d_A1 * d_x;`
			`Vec y1_gpu = d_y1.toHost(gpu_ctx.stream());`
			`Vec y1_cpu = A1 * x;`
			`RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();`
			`VERIFY((y1_gpu - y1_cpu).norm() / (y1_cpu.norm() + RealScalar(1)) < tol);`

			`// Second view overwrites first: now uses A2.`
			`auto d_A2 = ctx.deviceView(A2);`
			`DeviceMatrix<Scalar> d_y2;`
			`d_y2 = d_A2 * d_x;`
			`Vec y2_gpu = d_y2.toHost(gpu_ctx.stream());`
			`Vec y2_cpu = A2 * x;`
			`VERIFY((y2_gpu - y2_cpu).norm() / (y2_cpu.norm() + RealScalar(1)) < tol);`
			`}`

GPU: Add sparse solvers, FFT, and SpMV (cuDSS, cuFFT, cuSPARSE) Add GPU sparse direct solvers (Cholesky, LDL^T, LU) via cuDSS, 1D/2D FFT via cuFFT with plan caching, and sparse matrix-vector/matrix multiply (SpMV/SpMM) via cuSPARSE. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-09 19:11:49 -07:00			`// ---- Per-scalar driver ------------------------------------------------------`

			`template <typename Scalar>`
			`void test_scalar() {`
			`CALL_SUBTEST(test_spmv<Scalar>(64, 64));`
			`CALL_SUBTEST(test_spmv<Scalar>(128, 64)); // non-square`
			`CALL_SUBTEST(test_spmv<Scalar>(64, 128)); // wide`
			`CALL_SUBTEST(test_spmv_alpha_beta<Scalar>(64));`
			`CALL_SUBTEST(test_spmv_transpose<Scalar>(128, 64));`
			`CALL_SUBTEST(test_spmm<Scalar>(64, 64, 4));`
			`CALL_SUBTEST(test_identity<Scalar>(64));`
			`CALL_SUBTEST(test_reuse<Scalar>(64));`
			`CALL_SUBTEST(test_empty<Scalar>());`
GPU: Add BLAS-1 ops, DeviceScalar, device-resident SpMV, and CG interop (5/5) Add the operator interface needed for GPU iterative solvers: - BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(), noalias(), operator+=/-=/\= dispatching to cuBLAS axpy/scal/dot/nrm2. - DeviceScalar<Scalar>: device-resident scalar returned by reductions. Defers host sync until value is read (implicit conversion). Device-side division via NPP for real types. - GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(), cusparseHandle(). - GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm selection and plan caching. - GpuSparseContext: GpuContext& constructor for same-stream execution, deviceView() returning DeviceSparseView with operator for device-resident SpMV (d_y = d_A * d_x). - geam expressions: d_C = d_A + alpha * d_B via cublasXgeam. - GpuSVD::matrixV() convenience wrapper. These additions make DeviceMatrix usable as a VectorType in Eigen algorithm templates. Conjugate gradient is the motivating example and is tested against CPU ConjugateGradient for correctness. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-09 19:54:13 -07:00			`CALL_SUBTEST(test_spmv_device<Scalar>(64));`
			`CALL_SUBTEST(test_spmv_expr<Scalar>(64));`
			`CALL_SUBTEST(test_deviceview_overwrite<Scalar>(64));`
GPU: Add sparse solvers, FFT, and SpMV (cuDSS, cuFFT, cuSPARSE) Add GPU sparse direct solvers (Cholesky, LDL^T, LU) via cuDSS, 1D/2D FFT via cuFFT with plan caching, and sparse matrix-vector/matrix multiply (SpMV/SpMM) via cuSPARSE. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-09 19:11:49 -07:00			`}`

			`EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {`
			`CALL_SUBTEST(test_scalar<float>());`
			`CALL_SUBTEST(test_scalar<double>());`
			`CALL_SUBTEST(test_scalar<std::complex<float>>());`
			`CALL_SUBTEST(test_scalar<std::complex<double>>());`
			`}`