GPU: Add BLAS-1 ops, DeviceScalar, device-resident SpMV, and CG interop (5/5)

Add the operator interface needed for GPU iterative solvers: - BLAS Level-1 on DeviceMatrix: dot(), norm(), squaredNorm(), setZero(), noalias(), operator+=/-=/\*= dispatching to cuBLAS axpy/scal/dot/nrm2. - DeviceScalar<Scalar>: device-resident scalar returned by reductions. Defers host sync until value is read (implicit conversion). Device-side division via NPP for real types. - GpuContext: stream-borrowing constructor, setThreadLocal(), cublasLtHandle(), cusparseHandle(). - GEMM upgraded from cublasGemmEx to cublasLtMatmul with heuristic algorithm selection and plan caching. - GpuSparseContext: GpuContext& constructor for same-stream execution, deviceView() returning DeviceSparseView with operator* for device-resident SpMV (d_y = d_A * d_x). - geam expressions: d_C = d_A + alpha * d_B via cublasXgeam. - GpuSVD::matrixV() convenience wrapper. These additions make DeviceMatrix usable as a VectorType in Eigen algorithm templates. Conjugate gradient is the motivating example and is tested against CPU ConjugateGradient for correctness. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 11:34:33 +08:00 · 2026-04-09 19:54:13 -07:00
parent 43a95b62bb
commit 014f12f11a
22 changed files with 3363 additions and 151 deletions
--- a/Eigen/GPU
+++ b/Eigen/GPU
@@ -39,6 +39,7 @@

 #ifdef EIGEN_USE_GPU
 // IWYU pragma: begin_exports
+#include "src/GPU/DeviceScalar.h"
 #include "src/GPU/DeviceMatrix.h"
 #include "src/GPU/GpuContext.h"
 #include "src/GPU/DeviceExpr.h"
@@ -53,8 +54,10 @@
 #include "src/GPU/CuFftSupport.h"
 #include "src/GPU/GpuFFT.h"
 #include "src/GPU/CuSparseSupport.h"
+#ifdef EIGEN_SPARSECORE_MODULE_H
 #include "src/GPU/GpuSparseContext.h"
-#ifdef EIGEN_CUDSS
+#endif
+#if defined(EIGEN_CUDSS) && defined(EIGEN_SPARSECORE_MODULE_H)
 #include "src/GPU/CuDssSupport.h"
 #include "src/GPU/GpuSparseSolverBase.h"
 #include "src/GPU/GpuSparseLLT.h"
--- a/Eigen/src/GPU/CuBlasSupport.h
+++ b/Eigen/src/GPU/CuBlasSupport.h
@@ -21,6 +21,7 @@

 #include "./GpuSupport.h"
 #include <cublas_v2.h>
+#include <cublasLt.h>

 namespace Eigen {
 namespace internal {
@@ -50,14 +51,14 @@ constexpr cublasOperation_t to_cublas_op(GpuOp op) {
 }

 // ---- Scalar → cublasComputeType_t -------------------------------------------
-// cublasGemmEx requires a compute type (separate from the data type).
+// cublasLtMatmul requires a compute type (separate from the data type).
 //
 // Precision policy:
-//   - Default: tensor core algorithms enabled (CUBLAS_GEMM_DEFAULT_TENSOR_OP).
+//   - Default: tensor core algorithms enabled via cublasLtMatmul heuristics.
 //     For double, cuBLAS may use Ozaki emulation on sm_80+ tensor cores.
 //   - EIGEN_CUDA_TF32: opt-in to TF32 for float (~2x faster, 10-bit mantissa).
 //   - EIGEN_NO_CUDA_TENSOR_OPS: disables all tensor core usage. Uses pedantic
-//     compute types and CUBLAS_GEMM_DEFAULT algorithm. For bit-exact reproducibility.
+//     compute types. For bit-exact reproducibility.

 template <typename Scalar>
 struct cuda_compute_type;
@@ -98,8 +99,47 @@ struct cuda_compute_type<std::complex<double>> {
  static constexpr cublasComputeType_t value = CUBLAS_COMPUTE_64F;
 #endif
 };
-// ---- GEMM algorithm hint ----------------------------------------------------
+// ---- Alpha/beta scalar type for cublasLtMatmul ------------------------------
+// For standard types, alpha/beta match the scalar type.

+template <typename Scalar>
+struct cuda_gemm_scalar {
+  using type = Scalar;
+};
+
+// ---- cublasLt GEMM dispatch -------------------------------------------------
+// Wraps cublasLtMatmul with descriptor setup, heuristic algorithm selection,
+// and lazy workspace management. Supports 64-bit dimensions natively.
+//
+// The workspace buffer (DeviceBuffer*) is grown lazily to match the selected
+// algorithm's actual requirement. The heuristic is queried with a generous
+// 32 MB cap so that the best algorithm is never excluded. Growth is monotonic:
+// the buffer only grows, never shrinks, so reallocation happens at most a few
+// times during the lifetime of the owning GpuContext or solver.
+//
+// EIGEN_NO_CUDA_TENSOR_OPS: pedantic compute types (CUBLAS_COMPUTE_32F_PEDANTIC,
+// CUBLAS_COMPUTE_64F_PEDANTIC) prevent cublasLt from selecting tensor core
+// algorithms, matching the previous cublasGemmEx behavior.
+//
+// Thread safety: the workspace buffer is not thread-safe. All GEMM calls
+// sharing a workspace must be on the same CUDA stream (guaranteed by GpuContext's
+// single-stream design and by each GpuSVD owning its own stream).
+//
+// Future optimization: for hot loops (e.g., CG iteration), caching descriptors
+// and the selected algorithm by (m, n, k, dtype, transA, transB) would avoid
+// per-call descriptor creation and heuristic lookup overhead.
+
+#define EIGEN_CUBLASLT_CHECK(expr)                                       \
+  do {                                                                   \
+    cublasStatus_t _s = (expr);                                          \
+    eigen_assert(_s == CUBLAS_STATUS_SUCCESS && "cuBLASLt call failed"); \
+  } while (0)
+
+// Maximum workspace the heuristic is allowed to consider. This is a preference
+// ceiling, not an allocation — actual allocation matches the selected algorithm.
+static constexpr size_t kCublasLtMaxWorkspaceBytes = 32 * 1024 * 1024;  // 32 MB
+
+// cublasGemmEx fallback algorithm hint (used when cublasLt heuristic returns no results).
 constexpr cublasGemmAlgo_t cuda_gemm_algo() {
 #ifdef EIGEN_NO_CUDA_TENSOR_OPS
  return CUBLAS_GEMM_DEFAULT;
@@ -108,13 +148,73 @@ constexpr cublasGemmAlgo_t cuda_gemm_algo() {
 #endif
 }

-// ---- Alpha/beta scalar type for cublasGemmEx --------------------------------
-// For standard types, alpha/beta match the scalar type.
-
 template <typename Scalar>
-struct cuda_gemm_scalar {
-  using type = Scalar;
-};
+void cublaslt_gemm(cublasLtHandle_t lt_handle, cublasHandle_t cublas_handle, cublasOperation_t transA,
+                   cublasOperation_t transB, int64_t m, int64_t n, int64_t k,
+                   const typename cuda_gemm_scalar<Scalar>::type* alpha, const Scalar* A, int64_t lda, const Scalar* B,
+                   int64_t ldb, const typename cuda_gemm_scalar<Scalar>::type* beta, Scalar* C, int64_t ldc,
+                   DeviceBuffer* workspace, cudaStream_t stream) {
+  constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
+  constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
+  using AlphaType = typename cuda_gemm_scalar<Scalar>::type;
+  constexpr cudaDataType_t alpha_type = cuda_data_type<AlphaType>::value;
+
+  // Matmul descriptor.
+  cublasLtMatmulDesc_t matmul_desc = nullptr;
+  EIGEN_CUBLASLT_CHECK(cublasLtMatmulDescCreate(&matmul_desc, compute, alpha_type));
+  EIGEN_CUBLASLT_CHECK(
+      cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transA, sizeof(transA)));
+  EIGEN_CUBLASLT_CHECK(
+      cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, sizeof(transB)));
+
+  // Matrix layout descriptors (column-major).
+  // Physical layout dimensions: rows × cols with leading dimension lda/ldb/ldc.
+  const int64_t a_rows = (transA == CUBLAS_OP_N) ? m : k;
+  const int64_t b_rows = (transB == CUBLAS_OP_N) ? k : n;
+
+  cublasLtMatrixLayout_t layout_A = nullptr, layout_B = nullptr, layout_C = nullptr;
+  EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_A, dtype, a_rows, (transA == CUBLAS_OP_N) ? k : m, lda));
+  EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_B, dtype, b_rows, (transB == CUBLAS_OP_N) ? n : k, ldb));
+  EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutCreate(&layout_C, dtype, m, n, ldc));
+
+  // Heuristic selection: query with generous workspace cap, allocate only what's needed.
+  cublasLtMatmulPreference_t preference = nullptr;
+  EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceCreate(&preference));
+  size_t max_ws = kCublasLtMaxWorkspaceBytes;
+  EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+                                                            &max_ws, sizeof(max_ws)));
+
+  cublasLtMatmulHeuristicResult_t result;
+  int returned_results = 0;
+  cublasStatus_t heuristic_status = cublasLtMatmulAlgoGetHeuristic(lt_handle, matmul_desc, layout_A, layout_B, layout_C,
+                                                                   layout_C, preference, 1, &result, &returned_results);
+
+  if (heuristic_status == CUBLAS_STATUS_SUCCESS && returned_results > 0) {
+    // cublasLt path: use the selected algorithm with lazy workspace.
+    const size_t needed = result.workspaceSize;
+    if (needed > workspace->size()) {
+      // Sync only when freeing an existing buffer that may be in use.
+      if (workspace->ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
+      *workspace = DeviceBuffer(needed);
+    }
+
+    EIGEN_CUBLASLT_CHECK(cublasLtMatmul(lt_handle, matmul_desc, alpha, A, layout_A, B, layout_B, beta, C, layout_C, C,
+                                        layout_C, &result.algo, workspace->ptr, needed, stream));
+  } else {
+    // Fallback: cublasGemmEx for shapes/types that cublasLt cannot handle.
+    EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_handle, transA, transB, static_cast<int>(m), static_cast<int>(n),
+                                    static_cast<int>(k), alpha, A, dtype, static_cast<int>(lda), B, dtype,
+                                    static_cast<int>(ldb), beta, C, dtype, static_cast<int>(ldc), compute,
+                                    cuda_gemm_algo()));
+  }
+
+  // Cleanup descriptors.
+  EIGEN_CUBLASLT_CHECK(cublasLtMatmulPreferenceDestroy(preference));
+  EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_C));
+  EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_B));
+  EIGEN_CUBLASLT_CHECK(cublasLtMatrixLayoutDestroy(layout_A));
+  EIGEN_CUBLASLT_CHECK(cublasLtMatmulDescDestroy(matmul_desc));
+}

 // ---- Type-specific cuBLAS wrappers ------------------------------------------
 // cuBLAS uses separate functions per type (Strsm, Dtrsm, etc.).
@@ -227,6 +327,100 @@ inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transa, cu
                     reinterpret_cast<const cuDoubleComplex*>(B), ldb, reinterpret_cast<cuDoubleComplex*>(C), ldc);
 }

+// ---- cuBLAS Level-1 wrappers ------------------------------------------------
+// Type-dispatched wrappers for BLAS-1 vector operations: dot, axpy, nrm2, scal, copy.
+// These work with CUBLAS_POINTER_MODE_HOST or CUBLAS_POINTER_MODE_DEVICE depending
+// on the caller's configuration. For device pointer mode, scalar result pointers
+// (dot, nrm2) must point to device memory.
+
+// dot: result = x^T * y (real) or x^H * y (complex conjugate dot)
+inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const float* x, int incx, const float* y, int incy,
+                                 float* result) {
+  return cublasSdot(h, n, x, incx, y, incy, result);
+}
+inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const double* x, int incx, const double* y, int incy,
+                                 double* result) {
+  return cublasDdot(h, n, x, incx, y, incy, result);
+}
+inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const std::complex<float>* x, int incx,
+                                 const std::complex<float>* y, int incy, std::complex<float>* result) {
+  return cublasCdotc(h, n, reinterpret_cast<const cuComplex*>(x), incx, reinterpret_cast<const cuComplex*>(y), incy,
+                     reinterpret_cast<cuComplex*>(result));
+}
+inline cublasStatus_t cublasXdot(cublasHandle_t h, int n, const std::complex<double>* x, int incx,
+                                 const std::complex<double>* y, int incy, std::complex<double>* result) {
+  return cublasZdotc(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx,
+                     reinterpret_cast<const cuDoubleComplex*>(y), incy, reinterpret_cast<cuDoubleComplex*>(result));
+}
+
+// nrm2: result = ||x||_2 (always returns real)
+inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const float* x, int incx, float* result) {
+  return cublasSnrm2(h, n, x, incx, result);
+}
+inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const double* x, int incx, double* result) {
+  return cublasDnrm2(h, n, x, incx, result);
+}
+inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const std::complex<float>* x, int incx, float* result) {
+  return cublasScnrm2(h, n, reinterpret_cast<const cuComplex*>(x), incx, result);
+}
+inline cublasStatus_t cublasXnrm2(cublasHandle_t h, int n, const std::complex<double>* x, int incx, double* result) {
+  return cublasDznrm2(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx, result);
+}
+
+// axpy: y += alpha * x
+inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const float* alpha, const float* x, int incx, float* y,
+                                  int incy) {
+  return cublasSaxpy(h, n, alpha, x, incx, y, incy);
+}
+inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const double* alpha, const double* x, int incx, double* y,
+                                  int incy) {
+  return cublasDaxpy(h, n, alpha, x, incx, y, incy);
+}
+inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const std::complex<float>* alpha,
+                                  const std::complex<float>* x, int incx, std::complex<float>* y, int incy) {
+  return cublasCaxpy(h, n, reinterpret_cast<const cuComplex*>(alpha), reinterpret_cast<const cuComplex*>(x), incx,
+                     reinterpret_cast<cuComplex*>(y), incy);
+}
+inline cublasStatus_t cublasXaxpy(cublasHandle_t h, int n, const std::complex<double>* alpha,
+                                  const std::complex<double>* x, int incx, std::complex<double>* y, int incy) {
+  return cublasZaxpy(h, n, reinterpret_cast<const cuDoubleComplex*>(alpha), reinterpret_cast<const cuDoubleComplex*>(x),
+                     incx, reinterpret_cast<cuDoubleComplex*>(y), incy);
+}
+
+// scal: x *= alpha
+inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const float* alpha, float* x, int incx) {
+  return cublasSscal(h, n, alpha, x, incx);
+}
+inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const double* alpha, double* x, int incx) {
+  return cublasDscal(h, n, alpha, x, incx);
+}
+inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const std::complex<float>* alpha, std::complex<float>* x,
+                                  int incx) {
+  return cublasCscal(h, n, reinterpret_cast<const cuComplex*>(alpha), reinterpret_cast<cuComplex*>(x), incx);
+}
+inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const std::complex<double>* alpha, std::complex<double>* x,
+                                  int incx) {
+  return cublasZscal(h, n, reinterpret_cast<const cuDoubleComplex*>(alpha), reinterpret_cast<cuDoubleComplex*>(x),
+                     incx);
+}
+
+// copy: y = x
+inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const float* x, int incx, float* y, int incy) {
+  return cublasScopy(h, n, x, incx, y, incy);
+}
+inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const double* x, int incx, double* y, int incy) {
+  return cublasDcopy(h, n, x, incx, y, incy);
+}
+inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const std::complex<float>* x, int incx,
+                                  std::complex<float>* y, int incy) {
+  return cublasCcopy(h, n, reinterpret_cast<const cuComplex*>(x), incx, reinterpret_cast<cuComplex*>(y), incy);
+}
+inline cublasStatus_t cublasXcopy(cublasHandle_t h, int n, const std::complex<double>* x, int incx,
+                                  std::complex<double>* y, int incy) {
+  return cublasZcopy(h, n, reinterpret_cast<const cuDoubleComplex*>(x), incx, reinterpret_cast<cuDoubleComplex*>(y),
+                     incy);
+}
+
 }  // namespace internal
 }  // namespace Eigen

--- a/Eigen/src/GPU/DeviceDispatch.h
+++ b/Eigen/src/GPU/DeviceDispatch.h
@@ -29,10 +29,11 @@ namespace Eigen {
 namespace internal {

 // ---- GEMM dispatch ----------------------------------------------------------
-// GemmExpr<Lhs, Rhs> → cublasGemmEx(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)
+// GemmExpr<Lhs, Rhs> → cublasLtMatmul via GpuContext.
 //
-// The generic API cublasGemmEx handles all scalar types (float, double,
-// complex<float>, complex<double>) via cudaDataType_t.
+// Uses cublasLtMatmul for 64-bit dimension support and heuristic algorithm
+// selection. All scalar types (float, double, complex<float>, complex<double>)
+// are handled via cudaDataType_t.

 template <typename Lhs, typename Rhs>
 void dispatch_gemm(
@@ -46,6 +47,10 @@ void dispatch_gemm(
  const DeviceMatrix<Scalar>& A = traits_lhs::matrix(expr.lhs());
  const DeviceMatrix<Scalar>& B = traits_rhs::matrix(expr.rhs());

+  // cuBLAS GEMM: C must not alias A or B (undefined behavior).
+  eigen_assert(dst.data() != A.data() && "GEMM: output aliases left operand (use a temporary)");
+  eigen_assert(dst.data() != B.data() && "GEMM: output aliases right operand (use a temporary)");
+
  constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op);
  constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op);

@@ -89,13 +94,8 @@ void dispatch_gemm(
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
  }

-  constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
-  constexpr cublasComputeType_t compute = cuda_compute_type<Scalar>::value;
-
-  EIGEN_CUBLAS_CHECK(cublasGemmEx(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n),
-                                  static_cast<int>(k), &alpha_gval, A.data(), dtype, static_cast<int>(lda), B.data(),
-                                  dtype, static_cast<int>(ldb), &beta_gval, dst.data(), dtype, static_cast<int>(ldc),
-                                  compute, cuda_gemm_algo()));
+  cublaslt_gemm<Scalar>(ctx.cublasLtHandle(), ctx.cublasHandle(), transA, transB, m, n, k, &alpha_gval, A.data(), lda,
+                        B.data(), ldb, &beta_gval, dst.data(), ldc, ctx.gemmWorkspace(), ctx.stream());

  dst.recordReady(ctx.stream());
 }
@@ -504,6 +504,284 @@ void DeviceSelfAdjointView<Scalar_, UpLo_>::rankUpdate(const DeviceMatrix<Scalar
  internal::dispatch_syrk(GpuContext::threadLocal(), matrix(), expr, alpha, beta);
 }

+// ---- DeviceMatrix BLAS-1 out-of-line definitions ----------------------------
+// Defined here because they need the full GpuContext definition.
+// All methods take an explicit GpuContext& so callers can ensure same-stream
+// execution (zero event overhead when all operations share one context).
+//
+// Reduction methods (dot, norm, squaredNorm) use CUBLAS_POINTER_MODE_HOST:
+// the scalar result is written to host memory and cuBLAS synchronizes
+// internally before returning. This is necessary for Eigen template
+// compatibility — CG does `Scalar alpha = absNew / p.dot(tmp)` which
+// requires the host value immediately. A future GPU CG implementation
+// that controls the iteration loop can use CUBLAS_POINTER_MODE_DEVICE
+// to batch multiple reductions into a single sync point.
+
+template <typename Scalar_>
+DeviceScalar<typename DeviceMatrix<Scalar_>::Scalar> DeviceMatrix<Scalar_>::dot(GpuContext& ctx,
+                                                                                const DeviceMatrix& other) const {
+  const int n = static_cast<int>(rows_ * cols_);
+  eigen_assert(n == static_cast<int>(other.rows_ * other.cols_));
+  DeviceScalar<Scalar> result(Scalar(0), ctx.stream());
+  if (n > 0) {
+    waitReady(ctx.stream());
+    other.waitReady(ctx.stream());
+    cublasPointerMode_t prev;
+    EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
+    EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
+    EIGEN_CUBLAS_CHECK(internal::cublasXdot(ctx.cublasHandle(), n, data_, 1, other.data_, 1, result.devicePtr()));
+    EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
+  }
+  return result;
+}
+
+namespace internal {
+// Real: dot(x,x) returns DeviceScalar<Scalar> which IS DeviceScalar<RealScalar>.
+// Move-construct without any sync.
+template <typename Scalar, typename RealScalar>
+typename std::enable_if<std::is_same<Scalar, RealScalar>::value, DeviceScalar<RealScalar>>::type squaredNorm_from_dot(
+    DeviceScalar<Scalar>&& d, cudaStream_t) {
+  return std::move(d);
+}
+// Complex: must sync to extract the real part (DeviceScalar arithmetic is real-only).
+template <typename Scalar, typename RealScalar>
+typename std::enable_if<!std::is_same<Scalar, RealScalar>::value, DeviceScalar<RealScalar>>::type squaredNorm_from_dot(
+    DeviceScalar<Scalar>&& d, cudaStream_t stream) {
+  return DeviceScalar<RealScalar>(numext::real(Scalar(d)), stream);
+}
+}  // namespace internal
+
+template <typename Scalar_>
+DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::squaredNorm(GpuContext& ctx) const {
+  // Use dot(x,x) instead of nrm2()^2: dot kernel is ~4.5x faster than nrm2
+  // (nrm2 uses a numerically careful scaled-sum-of-squares algorithm that is
+  // unnecessary for CG convergence checks).
+  using RealScalar = typename NumTraits<Scalar_>::Real;
+  return internal::squaredNorm_from_dot<Scalar_, RealScalar>(dot(ctx, *this), ctx.stream());
+}
+
+template <typename Scalar_>
+DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::norm(GpuContext& ctx) const {
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  const int n = static_cast<int>(rows_ * cols_);
+  DeviceScalar<RealScalar> result(RealScalar(0), ctx.stream());
+  if (n > 0) {
+    waitReady(ctx.stream());
+    cublasPointerMode_t prev;
+    EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
+    EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
+    EIGEN_CUBLAS_CHECK(internal::cublasXnrm2(ctx.cublasHandle(), n, data_, 1, result.devicePtr()));
+    EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
+  }
+  return result;
+}
+
+template <typename Scalar_>
+void DeviceMatrix<Scalar_>::setZero(GpuContext& ctx) {
+  if (sizeInBytes() > 0) {
+    waitReady(ctx.stream());
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(data_, 0, sizeInBytes(), ctx.stream()));
+    recordReady(ctx.stream());
+  }
+}
+
+template <typename Scalar_>
+void DeviceMatrix<Scalar_>::addScaled(GpuContext& ctx, Scalar alpha, const DeviceMatrix& x) {
+  const int n = static_cast<int>(rows_ * cols_);
+  eigen_assert(n == static_cast<int>(x.rows_ * x.cols_));
+  if (n > 0) {
+    waitReady(ctx.stream());
+    x.waitReady(ctx.stream());
+    EIGEN_CUBLAS_CHECK(internal::cublasXaxpy(ctx.cublasHandle(), n, &alpha, x.data_, 1, data_, 1));
+    recordReady(ctx.stream());
+  }
+}
+
+template <typename Scalar_>
+void DeviceMatrix<Scalar_>::scale(GpuContext& ctx, Scalar alpha) {
+  const int n = static_cast<int>(rows_ * cols_);
+  if (n > 0) {
+    waitReady(ctx.stream());
+    EIGEN_CUBLAS_CHECK(internal::cublasXscal(ctx.cublasHandle(), n, &alpha, data_, 1));
+    recordReady(ctx.stream());
+  }
+}
+
+template <typename Scalar_>
+void DeviceMatrix<Scalar_>::copyFrom(GpuContext& ctx, const DeviceMatrix& other) {
+  // Wait on *this before resize — resize may free the old buffer while another
+  // stream is still reading it.
+  if (!empty()) waitReady(ctx.stream());
+  resize(other.rows_, other.cols_);
+  const int n = static_cast<int>(rows_ * cols_);
+  if (n > 0) {
+    other.waitReady(ctx.stream());
+    EIGEN_CUBLAS_CHECK(internal::cublasXcopy(ctx.cublasHandle(), n, other.data_, 1, data_, 1));
+    recordReady(ctx.stream());
+  }
+}
+
+// ---- BLAS-1 operator overloads for CG compatibility -------------------------
+
+// this += alpha * x  (axpy)
+template <typename Scalar_>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceScaled<DeviceMatrix>& expr) {
+  addScaled(GpuContext::threadLocal(), expr.scalar(), internal::device_expr_traits<DeviceMatrix>::matrix(expr.inner()));
+  return *this;
+}
+
+// this -= alpha * x  (axpy with negated alpha)
+template <typename Scalar_>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceScaled<DeviceMatrix>& expr) {
+  addScaled(GpuContext::threadLocal(), -expr.scalar(),
+            internal::device_expr_traits<DeviceMatrix>::matrix(expr.inner()));
+  return *this;
+}
+
+// this += x  (axpy with alpha=1)
+template <typename Scalar_>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceMatrix& other) {
+  Scalar one(1);
+  addScaled(GpuContext::threadLocal(), one, other);
+  return *this;
+}
+
+// this -= x  (axpy with alpha=-1)
+template <typename Scalar_>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceMatrix& other) {
+  Scalar neg_one(-1);
+  addScaled(GpuContext::threadLocal(), neg_one, other);
+  return *this;
+}
+
+// this *= alpha  (scal, host pointer)
+template <typename Scalar_>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator*=(Scalar alpha) {
+  scale(GpuContext::threadLocal(), alpha);
+  return *this;
+}
+
+// this *= alpha  (scal, device pointer — avoids host sync)
+template <typename Scalar_>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator*=(const DeviceScalar<Scalar>& alpha) {
+  const int n = static_cast<int>(rows_ * cols_);
+  if (n > 0) {
+    auto& ctx = GpuContext::threadLocal();
+    waitReady(ctx.stream());
+    cublasPointerMode_t prev;
+    EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
+    EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
+    EIGEN_CUBLAS_CHECK(internal::cublasXscal(ctx.cublasHandle(), n, alpha.devicePtr(), data_, 1));
+    EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
+    recordReady(ctx.stream());
+  }
+  return *this;
+}
+
+// this += DeviceScalar * x  (axpy with CUBLAS_POINTER_MODE_DEVICE)
+template <typename Scalar_>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const DeviceScaledDevice<Scalar_>& expr) {
+  const int n = static_cast<int>(rows_ * cols_);
+  const auto& x = expr.matrix();
+  eigen_assert(n == static_cast<int>(x.rows_ * x.cols_));
+  if (n > 0) {
+    auto& ctx = GpuContext::threadLocal();
+    waitReady(ctx.stream());
+    x.waitReady(ctx.stream());
+    cublasPointerMode_t prev;
+    EIGEN_CUBLAS_CHECK(cublasGetPointerMode(ctx.cublasHandle(), &prev));
+    EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), CUBLAS_POINTER_MODE_DEVICE));
+    EIGEN_CUBLAS_CHECK(internal::cublasXaxpy(ctx.cublasHandle(), n, expr.alpha().devicePtr(), x.data_, 1, data_, 1));
+    EIGEN_CUBLAS_CHECK(cublasSetPointerMode(ctx.cublasHandle(), prev));
+    recordReady(ctx.stream());
+  }
+  return *this;
+}
+
+// this -= DeviceScalar * x  (axpy with negated device scalar)
+template <typename Scalar_>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator-=(const DeviceScaledDevice<Scalar_>& expr) {
+  auto neg_alpha = -expr.alpha();
+  DeviceScaledDevice<Scalar_> neg_expr(neg_alpha, expr.matrix());
+  return operator+=(neg_expr);
+}
+
+// this = alpha * A + beta * B  (cuBLAS geam)
+template <typename Scalar_>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const DeviceAddExpr<Scalar_>& expr) {
+  auto& ctx = GpuContext::threadLocal();
+  const auto& A = expr.A();
+  const auto& B = expr.B();
+  eigen_assert(A.rows() == B.rows() && A.cols() == B.cols());
+  const int m = static_cast<int>(A.rows());
+  const int n = static_cast<int>(A.cols());
+  // Wait on *this before resize — resize may free the old buffer while another
+  // stream is still reading it.
+  if (!empty()) waitReady(ctx.stream());
+  resize(A.rows(), A.cols());
+  if (m > 0 && n > 0) {
+    A.waitReady(ctx.stream());
+    B.waitReady(ctx.stream());
+    Scalar_ alpha = expr.alpha();
+    Scalar_ beta = expr.beta();
+    EIGEN_CUBLAS_CHECK(internal::cublasXgeam(ctx.cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, m, n, &alpha, A.data(), m,
+                                             &beta, B.data(), m, data_, m));
+    recordReady(ctx.stream());
+  }
+  return *this;
+}
+
+// cwiseProduct via NPP nppsMul (allocating).
+template <typename Scalar_>
+DeviceMatrix<Scalar_> DeviceMatrix<Scalar_>::cwiseProduct(GpuContext& ctx, const DeviceMatrix& other) const {
+  const int n = static_cast<int>(rows_ * cols_);
+  eigen_assert(n == static_cast<int>(other.rows_ * other.cols_));
+  DeviceMatrix result(rows_, cols_);
+  if (n > 0) {
+    waitReady(ctx.stream());
+    other.waitReady(ctx.stream());
+    internal::device_cwiseProduct(data_, other.data_, result.data_, n, ctx.stream());
+    result.recordReady(ctx.stream());
+  }
+  return result;
+}
+
+// In-place cwiseProduct: this = a .* b (reuses this buffer, no allocation).
+template <typename Scalar_>
+void DeviceMatrix<Scalar_>::cwiseProduct(GpuContext& ctx, const DeviceMatrix& a, const DeviceMatrix& b) {
+  const int n = static_cast<int>(a.rows_ * a.cols_);
+  eigen_assert(n == static_cast<int>(b.rows_ * b.cols_));
+  if (!empty()) waitReady(ctx.stream());
+  resize(a.rows_, a.cols_);
+  if (n > 0) {
+    a.waitReady(ctx.stream());
+    b.waitReady(ctx.stream());
+    internal::device_cwiseProduct(a.data_, b.data_, data_, n, ctx.stream());
+    recordReady(ctx.stream());
+  }
+}
+
+// Convenience overloads using thread-local default GpuContext.
+template <typename Scalar_>
+DeviceScalar<typename DeviceMatrix<Scalar_>::Scalar> DeviceMatrix<Scalar_>::dot(const DeviceMatrix& other) const {
+  return dot(GpuContext::threadLocal(), other);
+}
+
+template <typename Scalar_>
+DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::squaredNorm() const {
+  return squaredNorm(GpuContext::threadLocal());
+}
+
+template <typename Scalar_>
+DeviceScalar<typename NumTraits<Scalar_>::Real> DeviceMatrix<Scalar_>::norm() const {
+  return norm(GpuContext::threadLocal());
+}
+
+template <typename Scalar_>
+void DeviceMatrix<Scalar_>::setZero() {
+  setZero(GpuContext::threadLocal());
+}
+
 }  // namespace Eigen

 #endif  // EIGEN_GPU_DEVICE_DISPATCH_H
--- a/Eigen/src/GPU/DeviceExpr.h
+++ b/Eigen/src/GPU/DeviceExpr.h
@@ -219,6 +219,87 @@ DeviceScaled<DeviceTransposeView<S>> operator*(S alpha, const DeviceTransposeVie
  return {alpha, m};
 }

+// ---- DeviceScaledDevice: DeviceScalar * DeviceMatrix → device-pointer axpy ---
+// Like DeviceScaled but carries a DeviceScalar (device pointer) instead of
+// a host scalar. operator+= dispatches to cuBLAS axpy with POINTER_MODE_DEVICE.
+
+template <typename Scalar_>
+class DeviceScaledDevice {
+ public:
+  using Scalar = Scalar_;
+  DeviceScaledDevice(const DeviceScalar<Scalar>& alpha, const DeviceMatrix<Scalar>& mat) : alpha_(alpha), mat_(mat) {}
+  const DeviceScalar<Scalar>& alpha() const { return alpha_; }
+  const DeviceMatrix<Scalar>& matrix() const { return mat_; }
+
+ private:
+  const DeviceScalar<Scalar>& alpha_;
+  const DeviceMatrix<Scalar>& mat_;
+};
+
+// DeviceScalar * DeviceMatrix → DeviceScaledDevice
+template <typename S>
+DeviceScaledDevice<S> operator*(const DeviceScalar<S>& alpha, const DeviceMatrix<S>& m) {
+  return {alpha, m};
+}
+
+// ---- DeviceAddExpr: a + b → cublasXgeam -------------------------------------
+// Captures `DeviceMatrix + DeviceScaled<DeviceMatrix>` (and reverse).
+// Dispatched to geam: C = alpha * A + beta * B.
+//
+// Note: These operator+/- overloads are intentionally free functions on
+// DeviceMatrix, not Eigen expression templates. DeviceMatrix does not inherit
+// from MatrixBase, so there is no ambiguity with Eigen's own operator+/-.
+// If DeviceMatrix is ever made an Eigen expression type, these would need to
+// be revisited.
+
+template <typename Scalar_>
+class DeviceAddExpr {
+ public:
+  using Scalar = Scalar_;
+  DeviceAddExpr(Scalar alpha, const DeviceMatrix<Scalar>& A, Scalar beta, const DeviceMatrix<Scalar>& B)
+      : alpha_(alpha), A_(A), beta_(beta), B_(B) {}
+  Scalar alpha() const { return alpha_; }
+  Scalar beta() const { return beta_; }
+  const DeviceMatrix<Scalar>& A() const { return A_; }
+  const DeviceMatrix<Scalar>& B() const { return B_; }
+
+ private:
+  Scalar alpha_;
+  const DeviceMatrix<Scalar>& A_;
+  Scalar beta_;
+  const DeviceMatrix<Scalar>& B_;
+};
+
+// DeviceMatrix + DeviceMatrix → DeviceAddExpr (alpha=1, beta=1)
+template <typename S>
+DeviceAddExpr<S> operator+(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
+  return {S(1), a, S(1), b};
+}
+
+// DeviceMatrix + DeviceScaled<DeviceMatrix> → DeviceAddExpr (alpha=1, beta=scaled)
+template <typename S>
+DeviceAddExpr<S> operator+(const DeviceMatrix<S>& a, const DeviceScaled<DeviceMatrix<S>>& b) {
+  return {S(1), a, b.scalar(), b.inner()};
+}
+
+// DeviceScaled<DeviceMatrix> + DeviceMatrix → DeviceAddExpr (alpha=scaled, beta=1)
+template <typename S>
+DeviceAddExpr<S> operator+(const DeviceScaled<DeviceMatrix<S>>& a, const DeviceMatrix<S>& b) {
+  return {a.scalar(), a.inner(), S(1), b};
+}
+
+// DeviceMatrix - DeviceMatrix → DeviceAddExpr (alpha=1, beta=-1)
+template <typename S>
+DeviceAddExpr<S> operator-(const DeviceMatrix<S>& a, const DeviceMatrix<S>& b) {
+  return {S(1), a, S(-1), b};
+}
+
+// DeviceMatrix - DeviceScaled<DeviceMatrix> → DeviceAddExpr (alpha=1, beta=-scaled)
+template <typename S>
+DeviceAddExpr<S> operator-(const DeviceMatrix<S>& a, const DeviceScaled<DeviceMatrix<S>>& b) {
+  return {S(1), a, -b.scalar(), b.inner()};
+}
+
 }  // namespace Eigen

 #endif  // EIGEN_GPU_DEVICE_EXPR_H
--- a/Eigen/src/GPU/DeviceMatrix.h
+++ b/Eigen/src/GPU/DeviceMatrix.h
@@ -53,6 +53,16 @@ template <typename>
 class DeviceAssignment;
 template <typename, typename>
 class GemmExpr;
+template <typename>
+class DeviceScaled;
+template <typename>
+class SpMVExpr;
+template <typename>
+class DeviceAddExpr;
+template <typename>
+class DeviceScaledDevice;
+template <typename>
+class DeviceScalar;
 template <typename, int>
 class LltSolveExpr;
 template <typename>
@@ -170,6 +180,8 @@ template <typename Scalar_>
 class DeviceMatrix {
 public:
  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using PlainObject = DeviceMatrix;  // owning type (for CG template compatibility)
  using PlainMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;

  // ---- Construction / destruction ------------------------------------------
@@ -177,6 +189,16 @@ class DeviceMatrix {
  /** Default: empty (0x0, no allocation). */
  DeviceMatrix() = default;

+  /** Allocate uninitialized column vector of given size.
+   * Matches Matrix<Scalar,Dynamic,1>(n) for CG template compatibility. */
+  explicit DeviceMatrix(Index n) : rows_(n), cols_(1) {
+    eigen_assert(n >= 0);
+    size_t bytes = sizeInBytes();
+    if (bytes > 0) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(reinterpret_cast<void**>(&data_), bytes));
+    }
+  }
+
  /** Allocate uninitialized device memory for a rows x cols matrix. */
  DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols) {
    eigen_assert(rows >= 0 && cols >= 0);
@@ -446,6 +468,104 @@ class DeviceMatrix {
  template <int UpLo>
  DeviceMatrix& operator=(const SymmExpr<Scalar, UpLo>& expr);

+  // ---- BLAS Level-1 operations ----------------------------------------------
+  // DeviceMatrix is always dense (lda == rows), so a vector is simply a
+  // DeviceMatrix with cols == 1. These BLAS-1 methods operate on the flat
+  // rows*cols element array, making them work for both vectors and matrices.
+  //
+  // All methods take an explicit GpuContext& for stream/handle control.
+  // When everything uses the same context, event waits are skipped (same-stream).
+  // Defined out-of-line in DeviceDispatch.h (needs GpuContext).
+
+  /** Dot product: this^H * other. Returns DeviceScalar — the result stays
+   * on device until read via implicit conversion to Scalar (which syncs).
+   * When used with `auto`, no sync occurs until the value is needed. */
+  DeviceScalar<Scalar> dot(GpuContext& ctx, const DeviceMatrix& other) const;
+
+  /** Squared L2 norm via dot(x, x). Returns DeviceScalar (no sync until read).
+   * For real types, the result stays on device. For complex types, falls back
+   * to host sync (DeviceScalar arithmetic is real-only). */
+  DeviceScalar<typename NumTraits<Scalar>::Real> squaredNorm(GpuContext& ctx) const;
+
+  /** L2 norm. Returns DeviceScalar (no host sync). */
+  DeviceScalar<typename NumTraits<Scalar>::Real> norm(GpuContext& ctx) const;
+
+  /** Set all elements to zero. */
+  void setZero(GpuContext& ctx);
+
+  /** this += alpha * x (cuBLAS axpy). Requires same total size. */
+  void addScaled(GpuContext& ctx, Scalar alpha, const DeviceMatrix& x);
+
+  /** this *= alpha (cuBLAS scal). */
+  void scale(GpuContext& ctx, Scalar alpha);
+
+  /** Deep copy: this = other (cuBLAS copy). Resizes if needed. */
+  void copyFrom(GpuContext& ctx, const DeviceMatrix& other);
+
+  // Convenience overloads using the thread-local default GpuContext.
+  DeviceScalar<Scalar> dot(const DeviceMatrix& other) const;
+  DeviceScalar<typename NumTraits<Scalar>::Real> squaredNorm() const;
+  DeviceScalar<typename NumTraits<Scalar>::Real> norm() const;
+  void setZero();
+
+  // ---- BLAS-1 operator overloads for CG/iterative solver compatibility ------
+  // These allow CG code like `x += alpha * p` to work with DeviceMatrix.
+  // `alpha * DeviceMatrix` already returns `DeviceScaled<DeviceMatrix<Scalar>>`
+  // (defined in DeviceExpr.h). These operators dispatch to cuBLAS axpy/scal.
+  // Defined out-of-line in DeviceDispatch.h.
+
+  /** this += alpha * x (cuBLAS axpy). For `x += alpha * p`. */
+  DeviceMatrix& operator+=(const DeviceScaled<DeviceMatrix>& expr);
+
+  /** this -= alpha * x (cuBLAS axpy with negated alpha). For `r -= alpha * tmp`. */
+  DeviceMatrix& operator-=(const DeviceScaled<DeviceMatrix>& expr);
+
+  /** this += x (cuBLAS axpy with alpha=1). */
+  DeviceMatrix& operator+=(const DeviceMatrix& other);
+
+  /** this -= x (cuBLAS axpy with alpha=-1). */
+  DeviceMatrix& operator-=(const DeviceMatrix& other);
+
+  /** this *= alpha (cuBLAS scal, host pointer mode). For `p *= beta`. */
+  DeviceMatrix& operator*=(Scalar alpha);
+
+  /** this *= alpha (cuBLAS scal, device pointer mode). Avoids host sync. */
+  DeviceMatrix& operator*=(const DeviceScalar<Scalar>& alpha);
+
+  /** Element-wise product: result[i] = this[i] * other[i] (NPP nppsMul).
+   * Returns a new DeviceMatrix. Defined out-of-line in DeviceDispatch.h. */
+  DeviceMatrix cwiseProduct(GpuContext& ctx, const DeviceMatrix& other) const;
+
+  /** In-place element-wise product: this[i] = a[i] * b[i] (NPP nppsMul).
+   * Reuses this matrix's buffer when sizes match, avoiding cudaMalloc. */
+  void cwiseProduct(GpuContext& ctx, const DeviceMatrix& a, const DeviceMatrix& b);
+
+  /** this += DeviceScalar * x (cuBLAS axpy with POINTER_MODE_DEVICE). */
+  DeviceMatrix& operator+=(const DeviceScaledDevice<Scalar>& expr);
+
+  /** this -= DeviceScalar * x (cuBLAS axpy with negated device scalar). */
+  DeviceMatrix& operator-=(const DeviceScaledDevice<Scalar>& expr);
+
+  /** Assign from an SpMV expression: d_y = d_A * d_x. */
+  DeviceMatrix& operator=(const SpMVExpr<Scalar>& expr);
+
+  /** Assign from an add expression: d_C = alpha * d_A + beta * d_B (cuBLAS geam). */
+  DeviceMatrix& operator=(const DeviceAddExpr<Scalar>& expr);
+
+  /** No-op — all DeviceMatrix operations are implicitly noalias.
+   *
+   * Unlike Eigen's Matrix, where omitting .noalias() triggers a copy to a
+   * temporary for safety, DeviceMatrix dispatches directly to NVIDIA library
+   * calls which have no built-in aliasing protection. Every assignment
+   * (`d_C = d_A * d_B`, `d_y = d_A * d_x`, etc.) behaves as if .noalias()
+   * were specified. The caller must ensure operands don't alias the
+   * destination for GEMM and SpMV. geam (`d_C = d_A + alpha * d_B`) is
+   * safe with aliasing. Debug asserts catch violations.
+   *
+   * This method exists so that `tmp.noalias() = mat * p` compiles for both
+   * Matrix and DeviceMatrix. */
+  DeviceMatrix& noalias() { return *this; }
+
 private:
  // ---- Private: adopt a raw device pointer (used by friend solvers) --------

--- a/Eigen/src/GPU/DeviceScalar.h
+++ b/Eigen/src/GPU/DeviceScalar.h
@@ -0,0 +1,121 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Device-resident scalar for deferred host synchronization.
+//
+// DeviceScalar<Scalar> wraps a single value in device memory. Reductions
+// (dot, nrm2) write results directly to device memory via
+// CUBLAS_POINTER_MODE_DEVICE, deferring host sync until the value is read.
+//
+// Implicit conversion to Scalar triggers cudaStreamSynchronize + download.
+// In CG, this reduces 3 syncs/iter to effectively 1: the first conversion
+// syncs the stream, subsequent conversions in the same expression just
+// download (the stream is already flushed).
+//
+// Usage:
+//   auto dot_val = d_x.dot(d_y);       // DeviceScalar, no sync
+//   auto norm_val = d_r.squaredNorm();  // DeviceScalar, no sync
+//   Scalar alpha = absNew / dot_val;    // sync here (both values downloaded)
+//   d_x += alpha * d_p;                 // host-scalar axpy (as before)
+
+#ifndef EIGEN_GPU_DEVICE_SCALAR_H
+#define EIGEN_GPU_DEVICE_SCALAR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSupport.h"
+#include "./DeviceScalarOps.h"
+
+namespace Eigen {
+
+template <typename Scalar_>
+class DeviceScalar {
+ public:
+  using Scalar = Scalar_;
+
+  /** Allocate uninitialized device scalar. Contents are undefined until written
+   * (e.g., by cuBLAS dot/nrm2 with POINTER_MODE_DEVICE). Consistent with
+   * DeviceMatrix(rows, cols) which also does not zero-initialize. */
+  explicit DeviceScalar(cudaStream_t stream = nullptr) : d_val_(sizeof(Scalar)), stream_(stream) {}
+
+  DeviceScalar(Scalar host_val, cudaStream_t stream) : d_val_(sizeof(Scalar)), stream_(stream) {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_val_.ptr, &host_val, sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
+  }
+
+  DeviceScalar(DeviceScalar&& o) noexcept : d_val_(std::move(o.d_val_)), stream_(o.stream_) { o.stream_ = nullptr; }
+
+  DeviceScalar& operator=(DeviceScalar&& o) noexcept {
+    if (this != &o) {
+      d_val_ = std::move(o.d_val_);
+      stream_ = o.stream_;
+      o.stream_ = nullptr;
+    }
+    return *this;
+  }
+
+  DeviceScalar(const DeviceScalar&) = delete;
+  DeviceScalar& operator=(const DeviceScalar&) = delete;
+
+  /** Download from device. Synchronizes the stream on first call;
+   * subsequent calls in the same expression are cheap (stream already flushed). */
+  Scalar get() const {
+    Scalar result;
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&result, d_val_.ptr, sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  /** Implicit conversion — allows `Scalar alpha = deviceScalar` and
+   * `if (deviceScalar < threshold)`. Triggers sync. */
+  operator Scalar() const { return get(); }
+
+  Scalar* devicePtr() { return static_cast<Scalar*>(d_val_.ptr); }
+  const Scalar* devicePtr() const { return static_cast<const Scalar*>(d_val_.ptr); }
+  cudaStream_t stream() const { return stream_; }
+
+  // ---- Device-side arithmetic (no host sync) ---------------------------------
+  // Uses NPP from DeviceScalarOps.h. All results stay on device.
+  // Currently supports real types only (float, double). Complex types
+  // fall back to implicit conversion (host sync) for division.
+  //
+  // Note: DeviceScalar has no cross-stream readiness tracking. All
+  // operations must be on the same CUDA stream. This is the natural
+  // pattern in iterative solvers where one GpuContext owns all work.
+
+  friend DeviceScalar operator/(const DeviceScalar& a, const DeviceScalar& b) {
+    DeviceScalar result(a.stream_);
+    internal::device_scalar_div(a.devicePtr(), b.devicePtr(), result.devicePtr(), a.stream_);
+    return result;
+  }
+
+  friend DeviceScalar operator/(Scalar a, const DeviceScalar& b) {
+    DeviceScalar d_a(a, b.stream_);
+    return d_a / b;
+  }
+
+  friend DeviceScalar operator/(const DeviceScalar& a, Scalar b) {
+    DeviceScalar d_b(b, a.stream_);
+    return a / d_b;
+  }
+
+  DeviceScalar operator-() const {
+    DeviceScalar result(stream_);
+    internal::device_scalar_neg(devicePtr(), result.devicePtr(), stream_);
+    return result;
+  }
+
+ private:
+  internal::DeviceBuffer d_val_;
+  cudaStream_t stream_ = nullptr;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_DEVICE_SCALAR_H
--- a/Eigen/src/GPU/DeviceScalarOps.h
+++ b/Eigen/src/GPU/DeviceScalarOps.h
@@ -0,0 +1,117 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Device-resident scalar and element-wise operations via NPP signals.
+// Header-only — no custom CUDA kernels needed. Uses nppsDiv, nppsMul,
+// nppsMulC from the NPP library (CUDA::npps, part of the CUDA toolkit).
+
+#ifndef EIGEN_GPU_DEVICE_SCALAR_OPS_H
+#define EIGEN_GPU_DEVICE_SCALAR_OPS_H
+
+#include <cuda_runtime.h>
+#include <npps_arithmetic_and_logical_operations.h>
+
+namespace Eigen {
+namespace internal {
+
+// ---- NppStreamContext helper ------------------------------------------------
+
+inline NppStreamContext make_npp_stream_ctx(cudaStream_t stream) {
+  // Cache device attributes (constant for process lifetime) in a thread-local.
+  // Only the stream and its flags vary per call.
+  struct CachedDeviceInfo {
+    bool initialized = false;
+    int device_id = 0;
+    int cc_major = 0;
+    int cc_minor = 0;
+    int mp_count = 0;
+    int max_threads_per_mp = 0;
+    int max_threads_per_block = 0;
+    int shared_mem_per_block = 0;
+
+    void init() {
+      if (initialized) return;
+      cudaGetDevice(&device_id);
+      cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device_id);
+      cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device_id);
+      cudaDeviceGetAttribute(&mp_count, cudaDevAttrMultiProcessorCount, device_id);
+      cudaDeviceGetAttribute(&max_threads_per_mp, cudaDevAttrMaxThreadsPerMultiProcessor, device_id);
+      cudaDeviceGetAttribute(&max_threads_per_block, cudaDevAttrMaxThreadsPerBlock, device_id);
+      cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id);
+      initialized = true;
+    }
+  };
+  thread_local CachedDeviceInfo cached;
+  cached.init();
+
+  NppStreamContext ctx = {};
+  ctx.hStream = stream;
+  ctx.nCudaDeviceId = cached.device_id;
+  ctx.nCudaDevAttrComputeCapabilityMajor = cached.cc_major;
+  ctx.nCudaDevAttrComputeCapabilityMinor = cached.cc_minor;
+  ctx.nMultiProcessorCount = cached.mp_count;
+  ctx.nMaxThreadsPerMultiProcessor = cached.max_threads_per_mp;
+  ctx.nMaxThreadsPerBlock = cached.max_threads_per_block;
+  ctx.nSharedMemPerBlock = cached.shared_mem_per_block;
+  cudaStreamGetFlags(stream, &ctx.nStreamFlags);
+  return ctx;
+}
+
+// ---- Scalar division: c = a / b (device-resident, async) --------------------
+
+inline void device_scalar_div(const float* a, const float* b, float* c, cudaStream_t stream) {
+  NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
+  nppsDiv_32f_Ctx(b, a, c, 1, npp_ctx);  // NPP: pDst[i] = pSrc2[i] / pSrc1[i]
+}
+
+inline void device_scalar_div(const double* a, const double* b, double* c, cudaStream_t stream) {
+  NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
+  nppsDiv_64f_Ctx(b, a, c, 1, npp_ctx);  // NPP: pDst[i] = pSrc2[i] / pSrc1[i]
+}
+
+// ---- Scalar negation: c = -a (device-resident, async) -----------------------
+
+inline void device_scalar_neg(const float* a, float* c, cudaStream_t stream) {
+  NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
+  nppsMulC_32f_Ctx(a, -1.0f, c, 1, npp_ctx);
+}
+
+inline void device_scalar_neg(const double* a, double* c, cudaStream_t stream) {
+  NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
+  nppsMulC_64f_Ctx(a, -1.0, c, 1, npp_ctx);
+}
+
+// ---- Element-wise vector multiply: c[i] = a[i] * b[i] ----------------------
+
+inline void device_cwiseProduct(const float* a, const float* b, float* c, int n, cudaStream_t stream) {
+  NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
+  nppsMul_32f_Ctx(a, b, c, static_cast<size_t>(n), npp_ctx);
+}
+
+inline void device_cwiseProduct(const double* a, const double* b, double* c, int n, cudaStream_t stream) {
+  NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
+  nppsMul_64f_Ctx(a, b, c, static_cast<size_t>(n), npp_ctx);
+}
+
+// ---- Element-wise vector division: c[i] = a[i] / b[i] ----------------------
+
+inline void device_cwiseQuotient(const float* a, const float* b, float* c, int n, cudaStream_t stream) {
+  NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
+  nppsDiv_32f_Ctx(b, a, c, static_cast<size_t>(n), npp_ctx);  // NPP: dst = src2 / src1
+}
+
+inline void device_cwiseQuotient(const double* a, const double* b, double* c, int n, cudaStream_t stream) {
+  NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
+  nppsDiv_64f_Ctx(b, a, c, static_cast<size_t>(n), npp_ctx);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_DEVICE_SCALAR_OPS_H
--- a/Eigen/src/GPU/GpuContext.h
+++ b/Eigen/src/GPU/GpuContext.h
@@ -28,6 +28,8 @@

 #include "./CuBlasSupport.h"
 #include "./CuSolverSupport.h"
+#include <cusparse.h>
+#include <cufft.h>

 namespace Eigen {

@@ -44,38 +46,92 @@ namespace Eigen {
 */
 class GpuContext {
 public:
-  GpuContext() {
+  /** Create a new context with a dedicated CUDA stream. */
+  GpuContext() : owns_stream_(true) {
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
-    EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
-    EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
-    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
-    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
+    init_handles();
  }

+  /** Create a context on an existing stream (e.g., stream 0 = nullptr).
+   * The caller retains ownership of the stream — this context will not destroy it. */
+  explicit GpuContext(cudaStream_t stream) : stream_(stream), owns_stream_(false) { init_handles(); }
+
  ~GpuContext() {
+    if (cusparse_) (void)cusparseDestroy(cusparse_);
    if (cusolver_) (void)cusolverDnDestroy(cusolver_);
+    if (cublas_lt_) (void)cublasLtDestroy(cublas_lt_);
    if (cublas_) (void)cublasDestroy(cublas_);
-    if (stream_) (void)cudaStreamDestroy(stream_);
+    if (owns_stream_ && stream_) (void)cudaStreamDestroy(stream_);
  }

  // Non-copyable, non-movable (owns library handles).
  GpuContext(const GpuContext&) = delete;
  GpuContext& operator=(const GpuContext&) = delete;

-  /** Lazily-created thread-local default context. */
+  /** Get the thread-local default context.
+   * If setThreadLocal() has been called, returns that context.
+   * Otherwise lazily creates a new context with a dedicated stream. */
  static GpuContext& threadLocal() {
+    GpuContext* override = tl_override_ptr();
+    if (override) return *override;
    thread_local GpuContext ctx;
    return ctx;
  }

+  /** Override the thread-local default context for this thread.
+   * The caller retains ownership of \p ctx — it must outlive all uses.
+   * Pass nullptr to restore the lazily-created default. */
+  static void setThreadLocal(GpuContext* ctx) { tl_override_ptr() = ctx; }
+
  cudaStream_t stream() const { return stream_; }
  cublasHandle_t cublasHandle() const { return cublas_; }
  cusolverDnHandle_t cusolverHandle() const { return cusolver_; }

+  /** cuBLASLt handle (lazy-initialized on first GEMM call). */
+  cublasLtHandle_t cublasLtHandle() const {
+    if (!cublas_lt_) {
+      EIGEN_CUBLAS_CHECK(cublasLtCreate(&cublas_lt_));
+    }
+    return cublas_lt_;
+  }
+
+  /** Workspace buffer for cublasLtMatmul (grown lazily by cublaslt_gemm).
+   * Not thread-safe — all GEMM calls must be on this context's stream. */
+  internal::DeviceBuffer* gemmWorkspace() const { return &gemm_workspace_; }
+
+  /** cuSPARSE handle (lazy-initialized on first call). */
+  cusparseHandle_t cusparseHandle() const {
+    if (!cusparse_) {
+      cusparseStatus_t s1 = cusparseCreate(&cusparse_);
+      eigen_assert(s1 == CUSPARSE_STATUS_SUCCESS && "cusparseCreate failed");
+      EIGEN_UNUSED_VARIABLE(s1);
+      cusparseStatus_t s2 = cusparseSetStream(cusparse_, stream_);
+      eigen_assert(s2 == CUSPARSE_STATUS_SUCCESS && "cusparseSetStream failed");
+      EIGEN_UNUSED_VARIABLE(s2);
+    }
+    return cusparse_;
+  }
+
 private:
  cudaStream_t stream_ = nullptr;
  cublasHandle_t cublas_ = nullptr;
  cusolverDnHandle_t cusolver_ = nullptr;
+  mutable cublasLtHandle_t cublas_lt_ = nullptr;   // lazy
+  mutable cusparseHandle_t cusparse_ = nullptr;    // lazy
+  mutable internal::DeviceBuffer gemm_workspace_;  // lazy
+  bool owns_stream_ = true;
+
+  static GpuContext*& tl_override_ptr() {
+    thread_local GpuContext* ptr = nullptr;
+    return ptr;
+  }
+
+  void init_handles() {
+    EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
+    EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
+  }
 };

 }  // namespace Eigen
--- a/Eigen/src/GPU/GpuSVD.h
+++ b/Eigen/src/GPU/GpuSVD.h
@@ -18,6 +18,7 @@
 //   GpuSVD<double> svd(A, ComputeThinU | ComputeThinV);
 //   VectorXd S = svd.singularValues();
 //   MatrixXd U = svd.matrixU();       // m×k or m×m
+//   MatrixXd V = svd.matrixV();         // n×k or n×n (matches JacobiSVD)
 //   MatrixXd VT = svd.matrixVT();      // k×n or n×n (this is V^T)
 //   MatrixXd X = svd.solve(B);        // pseudoinverse
 //   MatrixXd X = svd.solve(B, k);     // truncated (top k triplets)
@@ -53,6 +54,7 @@ class GpuSVD {

  ~GpuSVD() {
    if (handle_) (void)cusolverDnDestroy(handle_);
+    if (cublas_lt_) (void)cublasLtDestroy(cublas_lt_);
    if (cublas_) (void)cublasDestroy(cublas_);
    if (stream_) (void)cudaStreamDestroy(stream_);
  }
@@ -185,6 +187,10 @@ class GpuSVD {
    }
  }

+  /** Right singular vectors V. Returns n_orig × k or n_orig × n_orig.
+   * Equivalent to matrixVT().adjoint(). Matches Eigen's JacobiSVD::matrixV() API. */
+  PlainMatrix matrixV() const { return matrixVT().adjoint(); }
+
  /** Right singular vectors transposed V^T. Returns k × n_orig or n_orig × n_orig.
   * For transposed case, VT comes from cuSOLVER's U. */
  PlainMatrix matrixVT() const {
@@ -254,6 +260,8 @@ class GpuSVD {
  cudaStream_t stream_ = nullptr;
  cusolverDnHandle_t handle_ = nullptr;
  cublasHandle_t cublas_ = nullptr;
+  cublasLtHandle_t cublas_lt_ = nullptr;
+  mutable internal::DeviceBuffer gemm_workspace_;
  internal::CusolverParams params_;
  internal::DeviceBuffer d_A_;        // working copy of A (overwritten by gesvd)
  internal::DeviceBuffer d_U_;        // left singular vectors
@@ -277,6 +285,7 @@ class GpuSVD {
    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
    EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
    EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
+    EIGEN_CUBLASLT_CHECK(cublasLtCreate(&cublas_lt_));
    ensure_scratch(0);
  }

@@ -410,23 +419,21 @@ class GpuSVD {
    internal::DeviceBuffer d_tmp(static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar));
    {
      Scalar alpha_one(1), beta_zero(0);
-      constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
-      constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;

      if (!transposed_) {
        // U_stored^H * B: (m_×kk)^H × (m_×nrhs) → kk×nrhs.
-        EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs),
-                                        static_cast<int>(m_), &alpha_one, d_U_.ptr, dtype, static_cast<int>(m_),
-                                        d_B.ptr, dtype, static_cast<int>(m_orig), &beta_zero, d_tmp.ptr, dtype,
-                                        static_cast<int>(kk), compute, internal::cuda_gemm_algo()));
+        internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_C, CUBLAS_OP_N, kk, nrhs, m_, &alpha_one,
+                                        static_cast<const Scalar*>(d_U_.ptr), m_, static_cast<const Scalar*>(d_B.ptr),
+                                        m_orig, &beta_zero, static_cast<Scalar*>(d_tmp.ptr), kk, &gemm_workspace_,
+                                        stream_);
      } else {
        // VT_stored * B: VT_stored is vtrows×n_ = kk×m_orig (thin), NoTrans.
        // vtrows×m_orig times m_orig×nrhs → vtrows×nrhs. Use first kk rows.
        const Index vtrows_stored = (swap_uv_options(options_) & ComputeFullV) ? n_ : k;
-        EIGEN_CUBLAS_CHECK(cublasGemmEx(
-            cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(kk), static_cast<int>(nrhs), static_cast<int>(m_orig),
-            &alpha_one, d_VT_.ptr, dtype, static_cast<int>(vtrows_stored), d_B.ptr, dtype, static_cast<int>(m_orig),
-            &beta_zero, d_tmp.ptr, dtype, static_cast<int>(kk), compute, internal::cuda_gemm_algo()));
+        internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_N, CUBLAS_OP_N, kk, nrhs, m_orig, &alpha_one,
+                                        static_cast<const Scalar*>(d_VT_.ptr), vtrows_stored,
+                                        static_cast<const Scalar*>(d_B.ptr), m_orig, &beta_zero,
+                                        static_cast<Scalar*>(d_tmp.ptr), kk, &gemm_workspace_, stream_);
      }
    }

@@ -458,21 +465,19 @@ class GpuSVD {
    {
      internal::DeviceBuffer d_X(static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
      Scalar alpha_one(1), beta_zero(0);
-      constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
-      constexpr cublasComputeType_t compute = internal::cuda_compute_type<Scalar>::value;

      if (!transposed_) {
        const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
-        EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_C, CUBLAS_OP_N, static_cast<int>(n_orig),
-                                        static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_VT_.ptr, dtype,
-                                        static_cast<int>(vtrows), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero,
-                                        d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo()));
+        internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_C, CUBLAS_OP_N, n_orig, nrhs, kk, &alpha_one,
+                                        static_cast<const Scalar*>(d_VT_.ptr), vtrows,
+                                        static_cast<const Scalar*>(d_tmp.ptr), kk, &beta_zero,
+                                        static_cast<Scalar*>(d_X.ptr), n_orig, &gemm_workspace_, stream_);
      } else {
        // U_stored is m_×ucols. V_orig = U_stored[:,:kk]. NoTrans × tmp.
-        EIGEN_CUBLAS_CHECK(cublasGemmEx(cublas_, CUBLAS_OP_N, CUBLAS_OP_N, static_cast<int>(n_orig),
-                                        static_cast<int>(nrhs), static_cast<int>(kk), &alpha_one, d_U_.ptr, dtype,
-                                        static_cast<int>(m_), d_tmp.ptr, dtype, static_cast<int>(kk), &beta_zero,
-                                        d_X.ptr, dtype, static_cast<int>(n_orig), compute, internal::cuda_gemm_algo()));
+        internal::cublaslt_gemm<Scalar>(cublas_lt_, cublas_, CUBLAS_OP_N, CUBLAS_OP_N, n_orig, nrhs, kk, &alpha_one,
+                                        static_cast<const Scalar*>(d_U_.ptr), m_, static_cast<const Scalar*>(d_tmp.ptr),
+                                        kk, &beta_zero, static_cast<Scalar*>(d_X.ptr), n_orig, &gemm_workspace_,
+                                        stream_);
      }

      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_X.ptr,
--- a/Eigen/src/GPU/GpuSparseContext.h
+++ b/Eigen/src/GPU/GpuSparseContext.h
@@ -10,16 +10,25 @@
 // GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix
 // multiply (SpMM) via cuSPARSE.
 //
-// GpuSparseContext manages a cuSPARSE handle and device buffers. It accepts
-// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the
-// GPU. RowMajor input is implicitly converted to ColMajor.
+// GpuSparseContext manages cuSPARSE descriptors and device buffers. It accepts
+// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the GPU.
+// RowMajor input is implicitly converted to ColMajor.
+//
+// Can borrow a GpuContext for same-stream execution with BLAS-1 ops (zero
+// event overhead in iterative solvers like CG).
 //
 // Usage:
+//   // Standalone (own stream):
 //   GpuSparseContext<double> ctx;
-//   VectorXd y = ctx.multiply(A, x);           // y = A * x
-//   ctx.multiply(A, x, y, 2.0, 1.0);           // y = 2*A*x + y
-//   VectorXd z = ctx.multiplyT(A, x);          // z = A^T * x
-//   MatrixXd Y = ctx.multiplyMat(A, X);        // Y = A * X (multiple RHS)
+//   VectorXd y = ctx.multiply(A, x);
+//
+//   // Shared context (same stream as BLAS-1 ops):
+//   GpuContext gpu_ctx;
+//   GpuSparseContext<double> sparse_ctx(gpu_ctx);
+//   VectorXd y = sparse_ctx.multiply(A, x);
+//
+//   // Device-resident (no host roundtrip):
+//   sparse_ctx.multiply(A, d_x, d_y);  // DeviceMatrix in/out

 #ifndef EIGEN_GPU_SPARSE_CONTEXT_H
 #define EIGEN_GPU_SPARSE_CONTEXT_H
@@ -31,6 +40,57 @@

 namespace Eigen {

+// Forward declarations.
+template <typename Scalar_>
+class GpuSparseContext;
+template <typename Scalar_>
+class DeviceSparseView;
+
+/** SpMV expression: DeviceSparseView * DeviceMatrix → SpMVExpr.
+ * Evaluated by DeviceMatrix::operator=(SpMVExpr). */
+template <typename Scalar_>
+class SpMVExpr {
+ public:
+  using Scalar = Scalar_;
+  SpMVExpr(const DeviceSparseView<Scalar>& view, const DeviceMatrix<Scalar>& x) : view_(view), x_(x) {}
+  const DeviceSparseView<Scalar>& view() const { return view_; }
+  const DeviceMatrix<Scalar>& x() const { return x_; }
+
+ private:
+  const DeviceSparseView<Scalar>& view_;
+  const DeviceMatrix<Scalar>& x_;
+};
+
+/** Device-resident sparse matrix view. Returned by GpuSparseContext::deviceView().
+ * Lightweight handle referencing the context's cached device data.
+ *
+ * \warning One GpuSparseContext caches one sparse matrix at a time.
+ * Creating a second deviceView on the same context overwrites the first.
+ * For multiple simultaneous sparse matrices, use separate GpuSparseContext
+ * instances (they can share a GpuContext for same-stream execution).
+ *
+ * Supports `d_y = d_A * d_x` via SpMVExpr. */
+template <typename Scalar_>
+class DeviceSparseView {
+ public:
+  using Scalar = Scalar_;
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+
+  DeviceSparseView(GpuSparseContext<Scalar>& ctx, const SpMat& A) : ctx_(ctx), A_(A) {}
+
+  /** SpMV expression: d_A * d_x. Evaluated by DeviceMatrix::operator=. */
+  SpMVExpr<Scalar> operator*(const DeviceMatrix<Scalar>& x) const { return SpMVExpr<Scalar>(*this, x); }
+
+  Index rows() const { return A_.rows(); }
+  Index cols() const { return A_.cols(); }
+  const GpuSparseContext<Scalar>& context() const { return ctx_; }
+  const SpMat& matrix() const { return A_; }
+
+ private:
+  GpuSparseContext<Scalar>& ctx_;
+  const SpMat& A_;
+};
+
 template <typename Scalar_>
 class GpuSparseContext {
 public:
@@ -41,22 +101,47 @@ class GpuSparseContext {
  using DenseVector = Matrix<Scalar, Dynamic, 1>;
  using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;

-  GpuSparseContext() {
+  /** Standalone: creates own stream and cuSPARSE handle. */
+  GpuSparseContext() : owns_handle_(true) {
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    owns_stream_ = true;
    EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_));
    EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_));
  }

+  /** Borrow a GpuContext: shares stream and cuSPARSE handle.
+   * The GpuContext must outlive this GpuSparseContext. */
+  explicit GpuSparseContext(GpuContext& ctx)
+      : stream_(ctx.stream()), handle_(ctx.cusparseHandle()), owns_stream_(false), owns_handle_(false) {}
+
  ~GpuSparseContext() {
    destroy_descriptors();
-    if (handle_) (void)cusparseDestroy(handle_);
-    if (stream_) (void)cudaStreamDestroy(stream_);
+    if (owns_handle_ && handle_) (void)cusparseDestroy(handle_);
+    if (owns_stream_ && stream_) (void)cudaStreamDestroy(stream_);
  }

  GpuSparseContext(const GpuSparseContext&) = delete;
  GpuSparseContext& operator=(const GpuSparseContext&) = delete;

-  // ---- SpMV: y = A * x -----------------------------------------------------
+  // ---- Device sparse view (for expression syntax: d_y = d_A * d_x) ----------
+
+  /** Upload a sparse matrix to device and return a lightweight view.
+   * The sparse data is uploaded immediately and cached in this context.
+   * The returned view can be used for repeated SpMV without re-uploading.
+   * If the matrix values change, call deviceView() again to re-upload.
+   *
+   * \warning One context caches one matrix. Calling deviceView() again
+   * overwrites the previous upload. For multiple simultaneous matrices,
+   * use separate GpuSparseContext instances sharing the same GpuContext.
+   *
+   * Supports `d_y = d_A * d_x` expression syntax. */
+  DeviceSparseView<Scalar> deviceView(const SpMat& A) {
+    eigen_assert(A.isCompressed());
+    upload_sparse(A);
+    return DeviceSparseView<Scalar>(*this, A);
+  }
+
+  // ---- SpMV: y = A * x (host vectors) --------------------------------------

  /** Compute y = A * x. Returns y as a new dense vector. */
  template <typename InputType, typename Rhs>
@@ -64,34 +149,52 @@ class GpuSparseContext {
    const SpMat mat(A.derived());
    DenseVector y(mat.rows());
    y.setZero();
-    multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
+    multiply_host_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
    return y;
  }

-  /** Compute y = alpha * op(A) * x + beta * y (in-place). */
+  /** Compute y = alpha * op(A) * x + beta * y (in-place, host vectors). */
  template <typename InputType, typename Rhs, typename Dest>
  void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y,
                Scalar alpha = Scalar(1), Scalar beta = Scalar(0),
                cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
    const SpMat mat(A.derived());
-    multiply_impl(mat, x.derived(), y.derived(), alpha, beta, op);
+    multiply_host_impl(mat, x.derived(), y.derived(), alpha, beta, op);
  }

-  // ---- SpMV transpose: y = A^T * x -----------------------------------------
+  // ---- SpMV: y = A * x (DeviceMatrix, no host roundtrip) -------------------

-  /** Compute y = A^T * x. Returns y as a new dense vector. */
+  /** Compute d_y = A * d_x. Device-resident, no host transfer.
+   * Sparse matrix A is uploaded to device (cached). Dense vectors stay on device. */
+  template <typename InputType>
+  void multiply(const SparseMatrixBase<InputType>& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y) {
+    const SpMat mat(A.derived());
+    multiply_device_impl(mat, d_x, d_y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
+  }
+
+  /** Compute d_y = alpha * op(A) * d_x + beta * d_y (DeviceMatrix, in-place). */
+  template <typename InputType>
+  void multiply(const SparseMatrixBase<InputType>& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y,
+                Scalar alpha, Scalar beta, cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) {
+    const SpMat mat(A.derived());
+    multiply_device_impl(mat, d_x, d_y, alpha, beta, op);
+  }
+
+  // ---- SpMV transpose -------------------------------------------------------
+
+  /** Compute y = A^T * x (host vectors). */
  template <typename InputType, typename Rhs>
  DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
    const SpMat mat(A.derived());
    DenseVector y(mat.cols());
    y.setZero();
-    multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE);
+    multiply_host_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE);
    return y;
  }

-  // ---- SpMM: Y = A * X (multiple RHS) --------------------------------------
+  // ---- SpMM: Y = A * X (host, multiple RHS) --------------------------------

-  /** Compute Y = A * X where X is a dense matrix (multiple RHS). Returns Y. */
+  /** Compute Y = A * X where X is a dense matrix. Returns Y. */
  template <typename InputType, typename Rhs>
  DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X) {
    const SpMat mat(A.derived());
@@ -114,32 +217,37 @@ class GpuSparseContext {
 private:
  cudaStream_t stream_ = nullptr;
  cusparseHandle_t handle_ = nullptr;
+  bool owns_stream_ = false;
+  bool owns_handle_ = false;

-  // Cached device buffers (grow-only).
+  // Cached device buffers for sparse matrix (grow-only).
  internal::DeviceBuffer d_outerPtr_;
  internal::DeviceBuffer d_innerIdx_;
  internal::DeviceBuffer d_values_;
-  internal::DeviceBuffer d_x_;
-  internal::DeviceBuffer d_y_;
-  internal::DeviceBuffer d_workspace_;
  size_t d_outerPtr_size_ = 0;
  size_t d_innerIdx_size_ = 0;
  size_t d_values_size_ = 0;
+
+  // Cached device buffers for host-API dense vectors (grow-only).
+  internal::DeviceBuffer d_x_;
+  internal::DeviceBuffer d_y_;
  size_t d_x_size_ = 0;
  size_t d_y_size_ = 0;
-  size_t d_workspace_size_ = 0;

-  // Cached cuSPARSE descriptors.
+  mutable internal::DeviceBuffer d_workspace_;
+  mutable size_t d_workspace_size_ = 0;
+
+  // Cached cuSPARSE sparse matrix descriptor.
  cusparseSpMatDescr_t spmat_desc_ = nullptr;
  Index cached_rows_ = -1;
  Index cached_cols_ = -1;
  Index cached_nnz_ = -1;

-  // ---- SpMV implementation --------------------------------------------------
+  // ---- SpMV with host vectors (upload/download per call) --------------------

  template <typename RhsDerived, typename DestDerived>
-  void multiply_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta,
-                     cusparseOperation_t op) {
+  void multiply_host_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta,
+                          cusparseOperation_t op) {
    eigen_assert(A.isCompressed());

    const Index m = A.rows();
@@ -159,16 +267,13 @@ class GpuSparseContext {
      return;
    }

-    // Upload sparse matrix to device.
    upload_sparse(A);

-    // Upload x to device.
    ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar));
    const DenseVector x_tmp(x);
    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpyAsync(d_x_.ptr, x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));

-    // Upload y to device (for beta != 0).
    ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar));
    if (beta != Scalar(0)) {
      const DenseVector y_tmp(y);
@@ -176,27 +281,80 @@ class GpuSparseContext {
          cudaMemcpyAsync(d_y_.ptr, y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
    }

-    // Create dense vector descriptors.
+    exec_spmv(x_size, y_size, d_x_.ptr, d_y_.ptr, alpha, beta, op);
+
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(y.data(), d_y_.ptr, y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+  }
+
+  // ---- SpMV with DeviceMatrix (no host transfer) ----------------------------
+
+  // Called by public multiply(A, d_x, d_y) — always re-uploads A.
+  void multiply_device_impl(const SpMat& A, const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y, Scalar alpha,
+                            Scalar beta, cusparseOperation_t op) {
+    upload_sparse(A);
+    spmv_device_exec(d_x, d_y, alpha, beta, op);
+  }
+
+ public:
+  /** Execute SpMV using the already-uploaded sparse matrix (no re-upload).
+   * Used by SpMVExpr (d_y = d_A * d_x) for cached deviceView() paths.
+   * The sparse matrix must have been uploaded via deviceView() or multiply(). */
+  void spmv_device_exec(const DeviceMatrix<Scalar>& d_x, DeviceMatrix<Scalar>& d_y, Scalar alpha = Scalar(1),
+                        Scalar beta = Scalar(0), cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE) const {
+    eigen_assert(spmat_desc_ && "sparse matrix not uploaded — call deviceView() or multiply() first");
+    // cuSPARSE SpMV: y must not alias x (undefined behavior).
+    eigen_assert(d_x.data() != d_y.data() && "SpMV: output aliases input vector");
+
+    const Index m = cached_rows_;
+    const Index n = cached_cols_;
+    const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
+    const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
+
+    eigen_assert(d_x.rows() * d_x.cols() == x_size);
+
+    if (m == 0 || n == 0 || cached_nnz_ == 0) {
+      d_y.resize(y_size, 1);
+      if (beta == Scalar(0)) {
+        d_y.setZero();
+      }
+      return;
+    }
+
+    // Ensure d_y is allocated.
+    if (d_y.rows() * d_y.cols() != y_size) {
+      d_y.resize(y_size, 1);
+    }
+
+    // Wait for input data to be ready on this stream.
+    d_x.waitReady(stream_);
+    d_y.waitReady(stream_);
+
+    exec_spmv(x_size, y_size, const_cast<void*>(static_cast<const void*>(d_x.data())), static_cast<void*>(d_y.data()),
+              alpha, beta, op);
+
+    d_y.recordReady(stream_);
+  }
+
+ private:
+  // ---- Shared SpMV execution ------------------------------------------------
+
+  void exec_spmv(Index x_size, Index y_size, void* d_x_ptr, void* d_y_ptr, Scalar alpha, Scalar beta,
+                 cusparseOperation_t op) const {
    constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
    cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr;
-    EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_.ptr, dtype));
-    EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_.ptr, dtype));
+    EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_ptr, dtype));
+    EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_ptr, dtype));

-    // Query workspace size.
    size_t ws_size = 0;
    EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
                                                 CUSPARSE_SPMV_ALG_DEFAULT, &ws_size));
    ensure_buffer(d_workspace_, d_workspace_size_, ws_size);

-    // Execute SpMV.
    EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
                                      CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.ptr));

-    // Download result.
-    EIGEN_CUDA_RUNTIME_CHECK(
-        cudaMemcpyAsync(y.data(), d_y_.ptr, y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
-    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
-
    (void)cusparseDestroyDnVec(x_desc);
    (void)cusparseDestroyDnVec(y_desc);
  }
@@ -222,7 +380,6 @@ class GpuSparseContext {

    upload_sparse(A);

-    // Upload X to device.
    const size_t x_bytes = static_cast<size_t>(k) * static_cast<size_t>(n) * sizeof(Scalar);
    const size_t y_bytes = static_cast<size_t>(m) * static_cast<size_t>(n) * sizeof(Scalar);
    ensure_buffer(d_x_, d_x_size_, x_bytes);
@@ -232,24 +389,19 @@ class GpuSparseContext {
      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.ptr, Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_));
    }

-    // Create dense matrix descriptors.
    constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
    cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr;
-    // Eigen is column-major, so ld = rows.
    EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k, n, k, d_x_.ptr, dtype, CUSPARSE_ORDER_COL));
    EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m, n, m, d_y_.ptr, dtype, CUSPARSE_ORDER_COL));

-    // Query workspace.
    size_t ws_size = 0;
    EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_,
                                                 x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size));
    ensure_buffer(d_workspace_, d_workspace_size_, ws_size);

-    // Execute SpMM.
    EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta,
                                      y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.ptr));

-    // Download result.
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.ptr, y_bytes, cudaMemcpyDeviceToHost, stream_));
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));

@@ -278,21 +430,18 @@ class GpuSparseContext {
        cudaMemcpyAsync(d_innerIdx_.ptr, A.innerIndexPtr(), inner_bytes, cudaMemcpyHostToDevice, stream_));
    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.ptr, A.valuePtr(), val_bytes, cudaMemcpyHostToDevice, stream_));

-    // Recreate descriptor if shape changed.
    if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) {
      destroy_descriptors();

      constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
      constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value;

-      // ColMajor → CSC. outerIndexPtr = col offsets, innerIndexPtr = row indices.
      EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr,
                                             idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO, val_type));
      cached_rows_ = m;
      cached_cols_ = n;
      cached_nnz_ = nnz;
    } else {
-      // Same shape — just update pointers.
      EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.ptr, d_innerIdx_.ptr, d_values_.ptr));
    }
  }
@@ -307,7 +456,7 @@ class GpuSparseContext {
    cached_nnz_ = -1;
  }

-  void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) {
+  void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) const {
    if (needed > current_size) {
      if (buf.ptr) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
      buf = internal::DeviceBuffer(needed);
@@ -316,6 +465,17 @@ class GpuSparseContext {
  }
 };

+// ---- DeviceMatrix::operator=(SpMVExpr) out-of-line definition ----------------
+// Defined here because it needs the full GpuSparseContext definition.
+
+template <typename Scalar_>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const SpMVExpr<Scalar_>& expr) {
+  // Use spmv_device_exec — the sparse matrix was already uploaded by deviceView().
+  // No re-upload on repeated SpMV with the same view.
+  expr.view().context().spmv_device_exec(expr.x(), *this, Scalar_(1), Scalar_(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
+  return *this;
+}
+
 }  // namespace Eigen

 #endif  // EIGEN_GPU_SPARSE_CONTEXT_H
--- a/Eigen/src/GPU/GpuSupport.h
+++ b/Eigen/src/GPU/GpuSupport.h
@@ -21,6 +21,7 @@
 #include "./InternalHeaderCheck.h"

 #include <cuda_runtime.h>
+#include <vector>

 namespace Eigen {
 namespace internal {
@@ -36,26 +37,99 @@ namespace internal {

 // ---- RAII: device buffer ----------------------------------------------------

+// Thread-local pool of small device buffers to avoid cudaMalloc/cudaFree
+// overhead for tiny allocations (e.g., DeviceScalar). Buffers up to
+// kSmallBufferThreshold bytes are recycled; larger allocations bypass the pool.
+template <size_t SmallBufferThreshold = 256, size_t MaxPoolSize = 64>
+struct DeviceBufferPool {
+  static constexpr size_t kSmallBufferThreshold = SmallBufferThreshold;
+  static constexpr size_t kMaxPoolSize = MaxPoolSize;
+
+  struct Entry {
+    void* ptr;
+    size_t bytes;
+  };
+
+  ~DeviceBufferPool() {
+    for (auto& e : free_list_) (void)cudaFree(e.ptr);
+  }
+
+  void* allocate(size_t bytes) {
+    // Search for a buffer of sufficient size.
+    for (size_t i = 0; i < free_list_.size(); ++i) {
+      if (free_list_[i].bytes >= bytes) {
+        void* p = free_list_[i].ptr;
+        free_list_[i] = free_list_.back();
+        free_list_.pop_back();
+        return p;
+      }
+    }
+    // No suitable buffer found — allocate new.
+    void* p = nullptr;
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
+    return p;
+  }
+
+  void deallocate(void* p, size_t bytes) {
+    if (free_list_.size() < kMaxPoolSize) {
+      free_list_.push_back({p, bytes});
+    } else {
+      (void)cudaFree(p);
+    }
+  }
+
+  static DeviceBufferPool& threadLocal() {
+    thread_local DeviceBufferPool pool;
+    return pool;
+  }
+
+ private:
+  std::vector<Entry> free_list_;
+};
+
 struct DeviceBuffer {
  void* ptr = nullptr;

  DeviceBuffer() = default;

-  explicit DeviceBuffer(size_t bytes) {
-    if (bytes > 0) EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&ptr, bytes));
+  explicit DeviceBuffer(size_t bytes) : size_(bytes) {
+    if (bytes > 0) {
+      if (bytes <= DeviceBufferPool<>::kSmallBufferThreshold) {
+        ptr = DeviceBufferPool<>::threadLocal().allocate(bytes);
+      } else {
+        EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&ptr, bytes));
+      }
+    }
  }

  ~DeviceBuffer() {
-    if (ptr) (void)cudaFree(ptr);  // destructor: ignore errors
+    if (ptr) {
+      if (size_ <= DeviceBufferPool<>::kSmallBufferThreshold) {
+        DeviceBufferPool<>::threadLocal().deallocate(ptr, size_);
+      } else {
+        (void)cudaFree(ptr);
+      }
+    }
  }

  // Move-only.
-  DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr) { o.ptr = nullptr; }
+  DeviceBuffer(DeviceBuffer&& o) noexcept : ptr(o.ptr), size_(o.size_) {
+    o.ptr = nullptr;
+    o.size_ = 0;
+  }
  DeviceBuffer& operator=(DeviceBuffer&& o) noexcept {
    if (this != &o) {
-      if (ptr) (void)cudaFree(ptr);
+      if (ptr) {
+        if (size_ <= DeviceBufferPool<>::kSmallBufferThreshold) {
+          DeviceBufferPool<>::threadLocal().deallocate(ptr, size_);
+        } else {
+          (void)cudaFree(ptr);
+        }
+      }
      ptr = o.ptr;
+      size_ = o.size_;
      o.ptr = nullptr;
+      o.size_ = 0;
    }
    return *this;
  }
@@ -63,12 +137,19 @@ struct DeviceBuffer {
  DeviceBuffer(const DeviceBuffer&) = delete;
  DeviceBuffer& operator=(const DeviceBuffer&) = delete;

+  size_t size() const { return size_; }
+
  // Adopt an existing device pointer. Caller relinquishes ownership.
+  // Adopted buffers bypass the pool on destruction.
  static DeviceBuffer adopt(void* p) {
    DeviceBuffer b;
    b.ptr = p;
+    b.size_ = DeviceBufferPool<>::kSmallBufferThreshold + 1;  // force cudaFree
    return b;
  }
+
+ private:
+  size_t size_ = 0;
 };

 // ---- Scalar → cudaDataType_t ------------------------------------------------
--- a/Eigen/src/GPU/README.md
+++ b/Eigen/src/GPU/README.md
@@ -62,6 +62,23 @@ The GPU version reads like CPU Eigen with explicit upload/download for dense
 operations, and an almost identical API for sparse solvers. Unsupported
 expressions are compile errors.

+**Standalone module.** `Eigen/GPU` does not modify or depend on Eigen's Core
+expression template system (`MatrixBase`, `CwiseBinaryOp`, etc.).
+`DeviceMatrix` is not an Eigen expression type and does not inherit from
+`MatrixBase`. The expression layer is a thin compile-time dispatch where every
+supported expression maps to a single NVIDIA library call. There is no
+coefficient-level evaluation, lazy fusion, or packet operations.
+
+**Interoperability where useful.** `DeviceMatrix` provides the same operator
+signatures as `Matrix` for common vector operations: `+=`, `-=`, `*=`,
+`dot()`, `squaredNorm()`, `norm()`, `setZero()`, and `noalias()`. This makes
+`DeviceMatrix` usable as a drop-in `VectorType` in Eigen algorithm templates
+that rely on these operations. For example, Eigen's `conjugate_gradient()`
+template works with `DeviceMatrix` with a single typedef change -- no
+modifications to the algorithm or the expression template system. Conjugate
+gradient is just the motivating example; we are open to expanding operator
+coverage as needed to support other high-level Eigen algorithms on the GPU.
+
 **Explicit over implicit.** Host-device transfers, stream management, and
 library handle lifetimes are visible in the API. There are no hidden
 allocations or synchronizations except where documented (e.g., `toHost()` must
@@ -96,6 +113,27 @@ MatrixXd C = transfer.get();
 `selfadjointView<UpLo>()`, `llt()`, `lu()`. These return lightweight
 expression objects that are evaluated when assigned.

+For BLAS Level-1 operations, `DeviceMatrix` also provides `dot()`, `norm()`,
+`squaredNorm()`, `setZero()`, `noalias()`, and arithmetic operators
+(`+=`, `-=`, `*=`) that dispatch to cuBLAS `axpy`, `nrm2`, `dot`, and
+`geam`. These are the operations needed by iterative solvers.
+
+### `DeviceScalar<Scalar>`
+
+A device-resident scalar value. Reductions like `dot()`, `norm()`, and
+`squaredNorm()` return `DeviceScalar` instead of a host scalar, deferring
+the host synchronization until the value is actually needed:
+
+```cpp
+auto dot_val = d_x.dot(d_y);          // DeviceScalar -- no sync
+auto norm_sq = d_r.squaredNorm();      // DeviceScalar -- no sync
+Scalar alpha = dot_val / norm_sq;      // sync here (implicit conversion)
+d_x += alpha * d_p;                    // host scalar * DeviceMatrix (axpy)
+```
+
+Division between `DeviceScalar` values (real types only) is performed on
+device via NPP, avoiding extra synchronizations.
+
 ### `GpuContext`

 Every GPU operation needs a CUDA stream and library handles (cuBLAS,
@@ -118,6 +156,12 @@ d_C1.device(ctx1) = d_A1 * d_B1;   // runs on stream 1
 d_C2.device(ctx2) = d_A2 * d_B2;   // runs on stream 2 (concurrently)
 ```

+To integrate with existing CUDA code, borrow an existing stream:
+
+```cpp
+GpuContext ctx(my_existing_stream);  // wraps stream, does not take ownership
+```
+
 ## Usage

 ### Matrix operations (cuBLAS)
@@ -133,7 +177,7 @@ d_C = d_A * d_B.transpose();

 // Scaled and accumulated
 d_C += 2.0 * d_A * d_B;             // alpha=2, beta=1
-d_C.device(ctx) -= d_A * d_B;       // alpha=-1, beta=1 (requires explicit context)
+d_C.device(ctx) -= d_A * d_B;       // alpha=-1, beta=1 (GEMM requires explicit context for -=)

 // Triangular solve (TRSM)
 d_X = d_A.triangularView<Lower>().solve(d_B);
@@ -145,6 +189,30 @@ d_C = d_A.selfadjointView<Lower>() * d_B;
 d_C.selfadjointView<Lower>().rankUpdate(d_A);  // C += A * A^H
 ```

+### BLAS Level-1 operations
+
+```cpp
+// Dot product and norms (return DeviceScalar -- no sync until read)
+auto dot_val = d_x.dot(d_y);          // cublasDdot / cublasCdotc
+auto norm_val = d_r.norm();            // cublasDnrm2
+double n = norm_val;                   // implicit conversion triggers sync
+
+// Vector arithmetic (cuBLAS axpy / geam)
+d_x += alpha * d_p;                    // axpy: x = x + alpha * p
+d_x -= alpha * d_p;                    // axpy: x = x - alpha * p
+d_x *= alpha;                          // scal: x = alpha * x
+d_r.setZero();                         // cudaMemsetAsync
+
+// DeviceScalar arithmetic (stays on device, real types only)
+auto alpha = absNew / dot_val;         // device-side division via NPP
+d_x += alpha * d_p;                    // DeviceScalar * DeviceMatrix (axpy with device pointer)
+
+// Matrix add/subtract (cuBLAS geam)
+DeviceMatrix<double> d_C = d_A + d_B;           // C = A + B
+d_C = d_A + 2.0 * d_B;                          // C = A + 2*B
+d_C = d_A - d_B;                                 // C = A - B
+```
+
 ### Dense solvers (cuSOLVER)

 **One-shot expression syntax** -- Convenient, re-factorizes each time:
@@ -183,6 +251,7 @@ GpuSVD<double> svd;
 svd.compute(d_A, ComputeThinU | ComputeThinV);
 VectorXd S = svd.singularValues();   // downloads to host
 MatrixXd U = svd.matrixU();          // downloads to host
+MatrixXd V = svd.matrixV();          // V (matches JacobiSVD)
 MatrixXd VT = svd.matrixVT();        // V^T (matches cuSOLVER)

 // Self-adjoint eigenvalue decomposition (results downloaded on access)
@@ -253,21 +322,60 @@ MatrixXcf C = fft.inv2d(B);         // 2D inverse (scaled by 1/(rows*cols))
 SparseMatrix<double> A = ...;
 VectorXd x = ...;

-GpuSparseContext<double> ctx;
-VectorXd y = ctx.multiply(A, x);            // y = A * x
-VectorXd z = ctx.multiplyT(A, x);           // z = A^T * x
-ctx.multiply(A, x, y, 2.0, 1.0);            // y = 2*A*x + y
+// Host vectors (upload/download handled internally)
+GpuSparseContext<double> spmv;
+VectorXd y = spmv.multiply(A, x);           // y = A * x
+VectorXd z = spmv.multiplyT(A, x);          // z = A^T * x
+spmv.multiply(A, x, y, 2.0, 1.0);           // y = 2*A*x + y
+MatrixXd Y = spmv.multiplyMat(A, X);        // Y = A * X (SpMM)

-// Multiple RHS (SpMM)
-MatrixXd Y = ctx.multiplyMat(A, X);         // Y = A * X
+// Device-resident SpMV (sparse matrix cached on device)
+GpuSparseContext<double> spmv(ctx);          // share GpuContext for same-stream
+auto d_A = spmv.deviceView(A);              // upload sparse matrix once
+d_y = d_A * d_x;                            // operator syntax, stays on device
+```
+
+### Eigen algorithm interop (example: Conjugate gradient)
+
+The BLAS-1 operators and `DeviceSparseView` make `DeviceMatrix` usable as a
+vector type in GPU implementations of algorithms like conjugate gradient.
+Conjugate gradient is the motivating example -- a GPU CG implementation
+uses the same operations as the CPU version:
+
+```cpp
+GpuContext ctx;
+GpuSparseContext<double> spmv(ctx);
+auto d_A = spmv.deviceView(A);              // sparse matrix on device
+auto d_b = DeviceMatrix<double>::fromHost(b);
+auto d_x = DeviceMatrix<double>::fromHost(x0);
+
+// CG iteration using DeviceMatrix operators
+DeviceMatrix<double> d_r = d_b;             // r = b (deep copy via geam)
+DeviceMatrix<double> d_p(n), d_tmp(n);
+d_tmp = d_A * d_x;                          // SpMV (device-resident)
+d_r -= d_tmp;                               // axpy
+d_p = d_r.clone();
+RealScalar absNew = d_r.squaredNorm();       // DeviceScalar -> implicit sync
+
+for (int i = 0; i < maxIters && absNew > tol * tol; ++i) {
+  d_tmp = d_A * d_p;                         // SpMV
+  auto alpha = absNew / d_p.dot(d_tmp);      // host / DeviceScalar -> DeviceScalar
+  d_x += alpha * d_p;                        // axpy with DeviceScalar
+  d_r -= alpha * d_tmp;                      // axpy with DeviceScalar
+  RealScalar absOld = absNew;
+  absNew = d_r.squaredNorm();                // DeviceScalar -> implicit sync
+  d_p *= Scalar(absNew / absOld);            // scal (host scalars)
+  d_p += d_r;                                // axpy
+}
+MatrixXd x = d_x.toHost();
 ```

 ### Precision control

-GEMM dispatch enables tensor core algorithms by default, allowing cuBLAS to
-choose the fastest algorithm for the given precision and architecture. For
-double precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64
-results computed faster via tensor cores.
+GEMM dispatch uses `cublasLtMatmul` with heuristic algorithm selection,
+enabling cuBLAS to choose tensor core algorithms when beneficial. For double
+precision on sm_80+ (Ampere), this allows Ozaki emulation -- full FP64 results
+computed faster via tensor cores.

 | Macro | Effect |
 |---|---|
@@ -290,6 +398,7 @@ Mandatory sync points:
 - `fromHost()` -- Synchronizes to complete the upload before returning
 - `toHost()` / `HostTransfer::get()` -- Must deliver data to host
 - `info()` -- Must read the factorization status
+- `DeviceScalar` implicit conversion -- Downloads scalar from device

 **Cross-stream safety** is automatic. `DeviceMatrix` tracks write completion
 via CUDA events. When a matrix written on stream A is read on stream B, the
@@ -307,21 +416,33 @@ noted otherwise).

 | DeviceMatrix expression | Library call | Parameters |
 |---|---|---|
-| `C = A * B` | `cublasGemmEx` | transA=N, transB=N, alpha=1, beta=0 |
-| `C = A.adjoint() * B` | `cublasGemmEx` | transA=C, transB=N |
-| `C = A.transpose() * B` | `cublasGemmEx` | transA=T, transB=N |
-| `C = A * B.adjoint()` | `cublasGemmEx` | transA=N, transB=C |
-| `C = A * B.transpose()` | `cublasGemmEx` | transA=N, transB=T |
-| `C = alpha * A * B` | `cublasGemmEx` | alpha from LHS |
-| `C = A * (alpha * B)` | `cublasGemmEx` | alpha from RHS |
-| `C += A * B` | `cublasGemmEx` | alpha=1, beta=1 |
-| `C.device(ctx) -= A * B` | `cublasGemmEx` | alpha=-1, beta=1 |
+| `C = A * B` | `cublasLtMatmul` | transA=N, transB=N, alpha=1, beta=0 |
+| `C = A.adjoint() * B` | `cublasLtMatmul` | transA=C, transB=N |
+| `C = A.transpose() * B` | `cublasLtMatmul` | transA=T, transB=N |
+| `C = A * B.adjoint()` | `cublasLtMatmul` | transA=N, transB=C |
+| `C = A * B.transpose()` | `cublasLtMatmul` | transA=N, transB=T |
+| `C = alpha * A * B` | `cublasLtMatmul` | alpha from LHS |
+| `C = A * (alpha * B)` | `cublasLtMatmul` | alpha from RHS |
+| `C += A * B` | `cublasLtMatmul` | alpha=1, beta=1 |
+| `C.device(ctx) -= A * B` | `cublasLtMatmul` | alpha=-1, beta=1 |
 | `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs |
 | `X = A.llt<Upper>().solve(B)` | same | uplo=Upper |
 | `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs |
 | `X = A.triangularView<L>().solve(B)` | `cublasXtrsm` | side=L, uplo, diag=NonUnit |
 | `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
 | `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
+| `C = A + B` | `cublasXgeam` | alpha=1, beta=1 |
+| `C = A + alpha * B` | `cublasXgeam` | alpha=1, beta from scaled |
+| `C = A - B` | `cublasXgeam` | alpha=1, beta=-1 |
+| `C = A - alpha * B` | `cublasXgeam` | alpha=1, beta=-scaled |
+| `x += alpha * y` | `cublasXaxpy` | alpha (host scalar) |
+| `x += dAlpha * y` | `cublasXaxpy` | alpha (DeviceScalar, device pointer mode) |
+| `x -= alpha * y` | `cublasXaxpy` | alpha negated |
+| `x *= alpha` | `cublasXscal` | alpha (host or DeviceScalar) |
+| `x.dot(y)` | `cublasXdot` / `cublasXdotc` | returns `DeviceScalar` |
+| `x.norm()` | `cublasXnrm2` | returns `DeviceScalar<RealScalar>` |
+| `x.squaredNorm()` | `cublasXdot(x, x)` | returns `DeviceScalar<RealScalar>` |
+| `d_y = view * d_x` | `cusparseSpMV` | device-resident SpMV |

 ### `DeviceMatrix<Scalar>`

@@ -332,11 +453,12 @@ one column.
 ```cpp
 // Construction
 DeviceMatrix<Scalar>()                                   // Empty (0x0)
+DeviceMatrix<Scalar>(Index n)                            // Allocate column vector (n x 1)
 DeviceMatrix<Scalar>(rows, cols)                         // Allocate uninitialized

 // Upload / download
 static DeviceMatrix fromHost(matrix, stream=nullptr)           // -> DeviceMatrix (syncs)
-static DeviceMatrix fromHostAsync(ptr, rows, cols, outerStride, s)  // -> DeviceMatrix (no sync, caller manages ptr lifetime)
+static DeviceMatrix fromHostAsync(ptr, rows, cols, stream)         // -> DeviceMatrix (no sync, caller manages ptr lifetime)
 PlainMatrix        toHost(stream=nullptr)                      // -> host Matrix (syncs)
 HostTransfer       toHostAsync(stream=nullptr)                 // -> HostTransfer future (no sync)
 DeviceMatrix       clone(stream=nullptr)                       // -> DeviceMatrix (D2D copy, async)
@@ -357,6 +479,50 @@ LuExpr             lu()                                  // -> .solve(d_B) -> De
 TriangularView     triangularView<UpLo>()                // -> .solve(d_B) -> DeviceMatrix (TRSM)
 SelfAdjointView    selfadjointView<UpLo>()               // -> * d_B (SYMM), .rankUpdate(d_A) (SYRK)
 DeviceAssignment   device(GpuContext& ctx)                // Bind assignment to explicit stream
+DeviceMatrix&      noalias()                             // No-op (all ops are implicitly noalias)
+
+// BLAS Level-1 (all have overloads with explicit GpuContext& parameter)
+DeviceScalar<Scalar>     dot(const DeviceMatrix& other)  // cuBLAS dot/dotc -> DeviceScalar
+DeviceScalar<RealScalar> norm()                          // cuBLAS nrm2 -> DeviceScalar
+DeviceScalar<RealScalar>  squaredNorm()                    // dot(self, self) -> DeviceScalar (no sync)
+void                     setZero()                       // cudaMemsetAsync
+void                     addScaled(GpuContext&, Scalar alpha, const DeviceMatrix& x)  // this += alpha * x (axpy)
+void                     scale(GpuContext&, Scalar alpha)                              // this *= alpha (scal)
+void                     copyFrom(GpuContext&, const DeviceMatrix& other)              // this = other (D2D copy)
+DeviceMatrix& operator+=(Scalar * DeviceMatrix)          // cuBLAS axpy
+DeviceMatrix& operator-=(Scalar * DeviceMatrix)          // cuBLAS axpy (negated)
+DeviceMatrix& operator+=(const DeviceMatrix&)            // cuBLAS axpy
+DeviceMatrix& operator-=(const DeviceMatrix&)            // cuBLAS axpy
+DeviceMatrix& operator+=(const DeviceScaledDevice&)      // cuBLAS axpy (DeviceScalar * DeviceMatrix)
+DeviceMatrix& operator-=(const DeviceScaledDevice&)      // cuBLAS axpy (DeviceScalar * DeviceMatrix, negated)
+DeviceMatrix& operator*=(Scalar)                         // cuBLAS scal
+DeviceMatrix& operator*=(const DeviceScalar<Scalar>&)    // cuBLAS scal (device pointer)
+DeviceMatrix  cwiseProduct(GpuContext&, const DeviceMatrix&)            // NPP nppsMul (float/double only)
+void          cwiseProduct(GpuContext&, const DeviceMatrix&, const DeviceMatrix&)  // in-place: this = a .* b
+
+// geam expressions (evaluated on assignment)
+DeviceMatrix& operator=(const DeviceAddExpr&)            // C = A + B, C = A + alpha*B, C = A - B, etc.
+```
+
+### `DeviceScalar<Scalar>`
+
+Device-resident scalar. Returned by `dot()`, `norm()`, and `squaredNorm()`.
+Implicit conversion to `Scalar` triggers `cudaStreamSynchronize` + download.
+
+```cpp
+DeviceScalar(cudaStream_t stream = nullptr)              // Allocate uninitialized
+DeviceScalar(Scalar host_val, cudaStream_t stream)       // Upload host value
+
+Scalar         get()                                     // Download (syncs stream)
+               operator Scalar()                         // Implicit conversion (syncs)
+Scalar*        devicePtr()                               // Raw device pointer
+cudaStream_t   stream()
+
+// Device-side arithmetic (no host sync, real types only)
+DeviceScalar   operator/(DeviceScalar, DeviceScalar)     // NPP nppsDiv
+DeviceScalar   operator/(Scalar, DeviceScalar)           // upload + div
+DeviceScalar   operator/(DeviceScalar, Scalar)           // upload + div
+DeviceScalar   operator-()                               // NPP nppsMulC(-1)
 ```

 ### `GpuContext`
@@ -365,11 +531,15 @@ Unified GPU execution context owning a CUDA stream and library handles.

 ```cpp
 GpuContext()                                             // Creates dedicated stream + handles
+GpuContext(cudaStream_t stream)                          // Borrow existing stream (not owned)
 static GpuContext& threadLocal()                         // Per-thread default (lazy-created)
+static void        setThreadLocal(GpuContext* ctx)       // Override thread-local default (nullptr restores)

 cudaStream_t       stream()
 cublasHandle_t     cublasHandle()
 cusolverDnHandle_t cusolverHandle()
+cublasLtHandle_t   cublasLtHandle()                     // Lazy-initialized
+cusparseHandle_t   cusparseHandle()                     // Lazy-initialized
 ```

 Non-copyable, non-movable (owns library handles).
@@ -440,6 +610,7 @@ GpuSVD&            compute(const DeviceMatrix& d_A, unsigned options = ComputeTh

 RealVector         singularValues()                      // -> host vector (syncs, downloads)
 PlainMatrix        matrixU()                             // -> host Matrix (syncs, downloads)
+PlainMatrix        matrixV()                             // -> host Matrix (V = VT^H, matches JacobiSVD)
 PlainMatrix        matrixVT()                            // -> host Matrix (syncs, downloads V^T)

 PlainMatrix        solve(const MatrixBase<D>& B)         // -> host Matrix (pseudoinverse)
@@ -452,9 +623,9 @@ Index              rows() / cols()
 cudaStream_t       stream()
 ```

-**Note:** `singularValues()`, `matrixU()`, and `matrixVT()` download to host
-on each call. Device-side accessors returning `DeviceMatrix` are planned but
-not yet implemented.
+**Note:** `singularValues()`, `matrixU()`, `matrixV()`, and `matrixVT()`
+download to host on each call. Device-side accessors returning `DeviceMatrix`
+are planned but not yet implemented.

 ### `GpuSelfAdjointEigenSolver<Scalar>` -- Eigendecomposition (cuSOLVER)

@@ -544,28 +715,47 @@ the input scalar type (complex vs real).

 ### `GpuSparseContext<Scalar>` -- SpMV/SpMM (cuSPARSE)

-Accepts `SparseMatrix<Scalar, ColMajor>`. All methods accept host data and
-return host data.
+Accepts `SparseMatrix<Scalar, ColMajor>`.

 ```cpp
 GpuSparseContext()                                       // Creates own stream + cuSPARSE handle
+GpuSparseContext(GpuContext& ctx)                        // Borrow GpuContext for same-stream execution

-DenseVector        multiply(A, x)                                       // y = A * x
-void               multiply(A, x, y, alpha=1, beta=0,                   // y = alpha*op(A)*x + beta*y
+// Host data in/out
+DenseVector        multiply(A, x)                        // y = A * x
+void               multiply(A, x, y, alpha=1, beta=0,   // y = alpha*op(A)*x + beta*y
                     op=CUSPARSE_OPERATION_NON_TRANSPOSE)
-DenseVector        multiplyT(A, x)                                      // y = A^T * x
-DenseMatrix        multiplyMat(A, X)                                    // Y = A * X (SpMM)
+DenseVector        multiplyT(A, x)                       // y = A^T * x
+DenseMatrix        multiplyMat(A, X)                     // Y = A * X (SpMM)
+
+// DeviceMatrix in/out (sparse matrix re-uploaded each call)
+void               multiply(A, d_x, d_y)                // SpMV with device vectors
+void               multiply(A, d_x, d_y, alpha, beta, op)
+
+// Device-resident sparse matrix (upload once, reuse)
+DeviceSparseView   deviceView(A)                         // Upload sparse matrix, return view

 cudaStream_t       stream()
 ```

+### `DeviceSparseView<Scalar>` -- Device-resident sparse matrix
+
+Returned by `GpuSparseContext::deviceView()`. Holds a sparse matrix on device
+for repeated SpMV without re-uploading.
+
+```cpp
+SpMVExpr           operator*(const DeviceMatrix& d_x)    // d_y = view * d_x (evaluated on assignment)
+```
+
 ### Aliasing

 Unlike Eigen's `Matrix`, where omitting `.noalias()` triggers a copy to a
 temporary, DeviceMatrix dispatches directly to NVIDIA library calls which have
 no built-in aliasing protection. All operations are implicitly noalias.
 The caller must ensure operands don't alias the destination for GEMM and TRSM
-(debug asserts catch violations).
+(debug asserts catch violations). `geam` expressions (`d_C = d_A + alpha * d_B`)
+are safe with aliasing. The `.noalias()` method exists as a no-op for Eigen
+template compatibility.

 ## File layout

@@ -573,12 +763,14 @@ The caller must ensure operands don't alias the destination for GEMM and TRSM
 |------|-----------|----------|
 | `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `cuda_data_type<>` |
 | `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` |
-| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM expression wrappers |
+| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM and geam expression wrappers |
 | `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers |
 | `DeviceSolverExpr.h` | `DeviceMatrix.h` | Solver expression wrappers (LLT, LU) |
+| `DeviceScalar.h` | `GpuSupport.h`, `DeviceScalarOps.h` | `DeviceScalar<>` (device-resident scalar) |
+| `DeviceScalarOps.h` | `<npps_*.h>` | Scalar div/neg/cwiseProduct via NPP |
 | `DeviceDispatch.h` | all above | All dispatch functions + `DeviceAssignment` |
 | `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `GpuContext` |
-| `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>` | cuBLAS error macro, op/compute type maps |
+| `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>`, `<cublasLt.h>` | cuBLAS/cuBLASLt error macro, type maps |
 | `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
 | `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
 | `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
@@ -588,7 +780,7 @@ The caller must ensure operands don't alias the destination for GEMM and TRSM
 | `CuFftSupport.h` | `GpuSupport.h`, `<cufft.h>` | cuFFT error macro, type-dispatch wrappers |
 | `GpuFFT.h` | `CuFftSupport.h`, `CuBlasSupport.h` | 1D/2D FFT with plan caching |
 | `CuSparseSupport.h` | `GpuSupport.h`, `<cusparse.h>` | cuSPARSE error macro |
-| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE |
+| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE, `DeviceSparseView` |
 | `CuDssSupport.h` | `GpuSupport.h`, `<cudss.h>` | cuDSS error macro, type traits (optional) |
 | `GpuSparseSolverBase.h` | `CuDssSupport.h` | CRTP base for sparse solvers (optional) |
 | `GpuSparseLLT.h` | `GpuSparseSolverBase.h` | Sparse Cholesky via cuDSS (optional) |
@@ -606,7 +798,7 @@ cmake -G Ninja -B build -S . \

 cmake --build build --target gpu_cublas gpu_cusolver_llt gpu_cusolver_lu \
  gpu_cusolver_qr gpu_cusolver_svd gpu_cusolver_eigen \
-  gpu_device_matrix gpu_cufft gpu_cusparse_spmv
+  gpu_device_matrix gpu_cufft gpu_cusparse_spmv gpu_cg
 ctest --test-dir build -R "gpu_" --output-on-failure

 # Sparse solvers (cuDSS -- separate install required)
@@ -627,10 +819,19 @@ ctest --test-dir build -R gpu_cudss --output-on-failure
  Device-side accessors returning `DeviceMatrix` views of the internal buffers
  would allow chaining GPU operations (e.g., `svd.deviceU() * d_A`) without
  round-tripping through host memory.
- **Device-resident sparse matrix-vector products.** `GpuSparseContext`
-  currently operates on host vectors and matrices, uploading and downloading
-  on each call. The key missing piece is a `DeviceSparseView` that holds a
-  sparse matrix on device and supports operator syntax (`d_y = d_A * d_x`)
-  with `DeviceMatrix` operands -- keeping the entire SpMV/SpMM pipeline on
-  device. This is essential for iterative solvers and any workflow that chains
-  sparse and dense operations without returning to the host.
+- **Batched API (`DeviceBatchMatrix`).** A strided batch of N identical-size
+  matrices dispatching to cuBLAS/cuSOLVER batched APIs (`cublasDgemmBatched`,
+  `cusolverDnXpotrfBatched`, etc.). This enables robotics and model-predictive
+  control workloads where many small independent systems are solved in
+  parallel.
+- **cuTENSOR for Tensor module.** Replace the hand-written GPU tensor
+  contraction and reduction kernels (~2300 lines in
+  `TensorContractionGpu.h` / `TensorReductionGpu.h`) with cuTENSOR dispatch,
+  following the same library-dispatch pattern used by `Eigen/GPU`.
+- **Unified/zero-copy memory for Jetson.** Use `cudaMallocManaged` or
+  `cudaHostAllocMapped` to eliminate `fromHost()` / `toHost()` copies on
+  integrated GPUs (Jetson) where CPU and GPU share DRAM.
+- **Device-side Eigen interop.** Bridge between host-side `DeviceMatrix`
+  dispatch and device-side Eigen expression templates (Core + Tensor) running
+  inside CUDA kernels. Raw-pointer + `Map` / `TensorMap` as the zero-copy
+  interop surface.
--- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
+++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
@@ -31,7 +31,10 @@ EIGEN_DONT_INLINE void conjugate_gradient(const MatrixType& mat, const Rhs& rhs,
                                          Index& iters, typename Dest::RealScalar& tol_error) {
  typedef typename Dest::RealScalar RealScalar;
  typedef typename Dest::Scalar Scalar;
-  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  // Use Dest's plain (owning) type as VectorType. For CPU Matrix/Map this
+  // resolves to Matrix<Scalar,Dynamic,1>. For GPU DeviceMatrix, PlainObject
+  // is DeviceMatrix itself (already owning).
+  typedef typename Dest::PlainObject VectorType;

  RealScalar tol = tol_error;
  Index maxIters = iters;
--- a/benchmarks/GPU/CMakeLists.txt
+++ b/benchmarks/GPU/CMakeLists.txt
@@ -11,7 +11,7 @@
 #   ncu --set full -o profile ./build-bench-gpu/bench_gpu_solvers --benchmark_filter=BM_GpuLLT_Compute/4096

 cmake_minimum_required(VERSION 3.18)
-project(EigenGpuBenchmarks CXX)
+project(EigenGpuBenchmarks CXX CUDA)

 find_package(benchmark REQUIRED)
 find_package(CUDAToolkit REQUIRED)
@@ -55,3 +55,37 @@ eigen_add_gpu_benchmark(bench_gpu_batching_float bench_gpu_batching.cpp DEFINITI
 # FFT benchmarks: 1D/2D C2C, R2C, C2R throughput and plan reuse.
 eigen_add_gpu_benchmark(bench_gpu_fft bench_gpu_fft.cpp LIBRARIES CUDA::cufft)
 eigen_add_gpu_benchmark(bench_gpu_fft_double bench_gpu_fft.cpp LIBRARIES CUDA::cufft DEFINITIONS SCALAR=double)
+
+# CG sync overhead benchmark: host vs device pointer mode for reductions.
+# Uses CUDA kernels for device scalar arithmetic.
+add_executable(bench_gpu_cg_sync bench_gpu_cg_sync.cu)
+target_include_directories(bench_gpu_cg_sync PRIVATE
+  ${EIGEN_SOURCE_DIR}
+  ${CUDAToolkit_INCLUDE_DIRS})
+target_link_libraries(bench_gpu_cg_sync PRIVATE
+  benchmark::benchmark benchmark::benchmark_main
+  CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
+target_compile_options(bench_gpu_cg_sync PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
+target_compile_definitions(bench_gpu_cg_sync PRIVATE EIGEN_USE_GPU)
+
+# GPU CG vs CPU CG comparison benchmark.
+add_executable(bench_gpu_cg_vs_cpu bench_gpu_cg_vs_cpu.cu)
+target_include_directories(bench_gpu_cg_vs_cpu PRIVATE
+  ${EIGEN_SOURCE_DIR}
+  ${CUDAToolkit_INCLUDE_DIRS})
+target_link_libraries(bench_gpu_cg_vs_cpu PRIVATE
+  benchmark::benchmark benchmark::benchmark_main
+  CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
+target_compile_options(bench_gpu_cg_vs_cpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
+target_compile_definitions(bench_gpu_cg_vs_cpu PRIVATE EIGEN_USE_GPU)
+
+# Bundle Adjustment benchmark: GPU CG vs CPU CG on real BAL datasets.
+add_executable(bench_gpu_ba bench_gpu_ba.cu)
+target_include_directories(bench_gpu_ba PRIVATE
+  ${EIGEN_SOURCE_DIR}
+  ${CUDAToolkit_INCLUDE_DIRS})
+target_link_libraries(bench_gpu_ba PRIVATE
+  benchmark::benchmark
+  CUDA::cudart CUDA::cusolver CUDA::cublas CUDA::cusparse CUDA::npps CUDA::nppc)
+target_compile_options(bench_gpu_ba PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3 --expt-relaxed-constexpr>)
+target_compile_definitions(bench_gpu_ba PRIVATE EIGEN_USE_GPU)
--- a/benchmarks/GPU/ba_results.md
+++ b/benchmarks/GPU/ba_results.md
@@ -0,0 +1,149 @@
+# Bundle Adjustment: GPU CG vs CPU CG Results
+
+Benchmark of Eigen's GPU CG pipeline on normal equations arising from bundle
+adjustment (BAL datasets). Compares CPU `ConjugateGradient` (Jacobi preconditioner)
+against GPU CG using `DeviceMatrix` + `GpuSparseContext` + `DeviceScalar`.
+
+## Hardware
+
+- **CPU**: Intel Core i7-13700HX (Raptor Lake, 12 cores / 24 threads, single thread for Eigen CG)
+- **GPU**: NVIDIA GeForce RTX 4070 Laptop GPU (Ada Lovelace, 4608 CUDA cores, 8 GB GDDR6)
+- **CUDA**: 13.2 / Driver 595.79
+- **OS**: Ubuntu 24.04 (WSL2, kernel 6.6.87)
+
+## Software
+
+- Eigen: `eigen-gpu-cg` branch
+- Google Benchmark 1.9.1
+- Compiler: nvcc 13.2 + g++ 13.3
+- Normal equations: H = J^T*J + I (Levenberg-Marquardt damping lambda=1.0)
+- CG tolerance: 1e-8, max iterations: 10000
+
+## Method
+
+For each BAL problem file:
+1. Parse the BAL file (cameras, 3D points, 2D observations)
+2. Compute the full Jacobian J using the BAL camera model (Rodrigues rotation +
+   perspective projection + radial distortion) with central finite differences
+3. Form the normal equations H = J^T*J + lambda*I (sparse, symmetric positive definite)
+4. Solve H*dx = -J^T*r using CG with Jacobi preconditioner on CPU and GPU
+5. Report wall-clock time (mean of 3 repetitions)
+
+GPU CG uses: `GpuSparseContext` for SpMV, `DeviceMatrix` for vectors,
+`DeviceScalar` with `CUBLAS_POINTER_MODE_DEVICE` for dot/norm reductions,
+in-place `cwiseProduct` via NPP for Jacobi preconditioner application,
+device-pointer-mode `scal` to avoid host sync on the beta update.
+
+## Results
+
+### Summary table
+
+| Dataset | Cameras | Points | Obs | H size | H nnz | CG iters | CPU CG (ms) | GPU CG (ms) | Speedup |
+|---------|---------|--------|-----|--------|-------|----------|-------------|-------------|---------|
+| Ladybug-49 | 49 | 7,776 | 31,843 | 23,769 | 1.8M | 4,421 | 4,006 | 1,152 | **3.5x** |
+| Ladybug-138 | 138 | 19,878 | 85,217 | 60,876 | 4.8M | 7,008 | 21,498 | 3,553 | **6.1x** |
+| Ladybug-646 | 646 | 73,584 | 327,297 | 226,566 | 18.4M | 10,000* | 123,727 | 14,268 | **8.7x** |
+| Dubrovnik-356 | 356 | 226,730 | 1,255,268 | 683,394 | 69.8M | 4,308 | 216,149 | 24,493 | **8.8x** |
+
+\* Hit 10,000 iteration cap (poorly conditioned problem). Both CPU and GPU
+hit the same cap, so timing comparison remains valid.
+
+### Profile breakdown (Ladybug-138, nsys)
+
+GPU kernel time is dominated by SpMV (91%). The remaining 9% is BLAS-1
+operations (dot, axpy, scal) and NPP element-wise ops (cwiseProduct).
+
+| Kernel | Time (ms) | % | Calls |
+|--------|-----------|---|-------|
+| cuSPARSE csrmv (SpMV) | 2507 | 91.3% | 7,006 |
+| cuBLAS dot | 92 | 3.4% | 21,020 |
+| cuBLAS axpy (device ptr) | 27 | 1.0% | 14,012 |
+| cuSPARSE partition | 19 | 0.7% | 7,006 |
+| NPP cwiseProduct | 16 + 13 | 1.1% | 14,011 + 7,006 |
+| cuBLAS axpy (host ptr) | 12 | 0.5% | 7,005 |
+| cuBLAS scal (device ptr) | 11 | 0.4% | 7,005 |
+| NPP scalar ops | 7 | 0.2% | 7,006 |
+
+### Optimizations applied
+
+Three profiling-driven optimizations reduced GPU CG time by **1.8x**
+(6.5s → 3.6s on Ladybug-138):
+
+1. **In-place `cwiseProduct`**: The Jacobi preconditioner apply
+   (`z = invdiag .* residual`) was allocating a new DeviceMatrix every
+   iteration. Added `z.cwiseProduct(ctx, a, b)` that reuses `z`'s buffer.
+   Reduced `cudaMalloc` calls from 7,053 to 23 (saving 2.3s).
+
+2. **`squaredNorm` via `dot(x,x)`**: cuBLAS `nrm2` uses a numerically
+   careful scaled-sum-of-squares algorithm (29µs/call). Replaced with
+   `dot(x,x)` (6.4µs/call) — 4.5x faster per call, saving ~320ms.
+
+3. **Device-pointer `scal`**: `p *= beta` was converting `DeviceScalar`
+   beta to host (triggering a stream sync), then calling host-pointer-mode
+   scal. Added `operator*=(DeviceScalar)` that uses device-pointer-mode
+   scal, eliminating one sync per iteration. Halved `cudaStreamSynchronize`
+   calls from 14K to 7K.
+
+### Observations
+
+1. **GPU speedup scales with problem size**: from 3.5x on small problems
+   (24K variables) to 8.8x on large problems (683K variables). This is
+   expected — larger problems have more parallelism for the GPU to exploit.
+
+2. **Iteration counts match**: CPU and GPU CG converge in the same number
+   of iterations (within 1%), confirming numerical equivalence.
+
+3. **Bottleneck is SpMV**: CG iteration time is dominated (91%) by the
+   sparse matrix-vector product on H. Further speedup requires either
+   faster SpMV (e.g., block-sparse formats) or algorithmic improvements
+   (Schur complement, better preconditioners).
+
+4. **Remaining overhead**: CUDA API calls (cudaMemcpyAsync for 8-byte
+   DeviceScalar transfers) account for ~50% of non-kernel time. Batching
+   multiple scalar reductions into a single transfer would help.
+
+5. **Jacobi preconditioner is weak for BA**: The Ladybug-646 problem does
+   not converge in 10K iterations. Ceres uses block Jacobi or Schur
+   complement preconditioners that would also benefit from GPU acceleration.
+
+### Scaling plot data
+
+```
+# n        nnz_H       cpu_ms    gpu_ms    speedup
+23769      1793475     4006      1152      3.48
+60876      4791762     21498     3553      6.05
+226566     18387948    123727    14268     8.67
+683394     69827066    216149    24493     8.82
+```
+
+## BAL datasets
+
+Downloaded from http://grail.cs.washington.edu/projects/bal/
+
+| File | Source |
+|------|--------|
+| problem-49-7776-pre.txt | Ladybug sequence |
+| problem-138-19878-pre.txt | Ladybug sequence |
+| problem-646-73584-pre.txt | Ladybug sequence |
+| problem-356-226730-pre.txt | Dubrovnik reconstruction |
+
+## Reproducing
+
+```bash
+# Build
+cmake -G Ninja -B build-bench-gpu -S benchmarks/GPU -DCMAKE_CUDA_ARCHITECTURES=89
+cmake --build build-bench-gpu --target bench_gpu_ba
+
+# Download BAL datasets
+wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-49-7776-pre.txt.bz2
+wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-138-19878-pre.txt.bz2
+wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-646-73584-pre.txt.bz2
+wget http://grail.cs.washington.edu/projects/bal/data/dubrovnik/problem-356-226730-pre.txt.bz2
+bunzip2 *.bz2
+
+# Run (one at a time)
+BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
+BAL_FILE=problem-138-19878-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
+BAL_FILE=problem-646-73584-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
+BAL_FILE=problem-356-226730-pre.txt ./build-bench-gpu/bench_gpu_ba --benchmark_repetitions=3
+```
--- a/benchmarks/GPU/bench_gpu_ba.cu
+++ b/benchmarks/GPU/bench_gpu_ba.cu
@@ -0,0 +1,533 @@
+// Bundle Adjustment benchmark: GPU CG vs CPU CG on real BAL datasets.
+//
+// Tests Eigen's GPU CG pipeline (DeviceMatrix + GpuSparseContext + DeviceScalar)
+// on the normal equations (J^T*J) arising from bundle adjustment problems.
+//
+// Reads a BAL (Bundle Adjustment in the Large) format file, computes the
+// Jacobian and residual, forms the normal equations H = J^T*J + lambda*I,
+// then solves H*dx = -J^T*r with both CPU and GPU conjugate gradients.
+//
+// BAL format: http://grail.cs.washington.edu/projects/bal/
+//
+// Usage:
+//   cmake --build build-bench-gpu --target bench_gpu_ba
+//
+//   # Download a BAL dataset (bz2-compressed):
+//   wget http://grail.cs.washington.edu/projects/bal/data/ladybug/problem-49-7776-pre.txt.bz2
+//   bunzip2 problem-49-7776-pre.txt.bz2
+//
+//   # Run on a specific problem:
+//   BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba
+//
+//   # Append results to the log:
+//   BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba \
+//     --benchmark_format=console 2>&1 | tee -a benchmarks/GPU/ba_results.log
+
+#include <benchmark/benchmark.h>
+
+#include <Eigen/Sparse>
+#include <Eigen/IterativeLinearSolvers>
+#include <Eigen/GPU>
+
+#include <cmath>
+#include <cstdio>
+#include <fstream>
+#include <string>
+#include <vector>
+
+using namespace Eigen;
+
+// ============================================================================
+// BAL problem data
+// ============================================================================
+
+struct BALProblem {
+  int num_cameras = 0;
+  int num_points = 0;
+  int num_observations = 0;
+
+  // Observations: (camera_idx, point_idx, observed_x, observed_y).
+  std::vector<int> camera_index;
+  std::vector<int> point_index;
+  std::vector<double> observations_x;
+  std::vector<double> observations_y;
+
+  // Camera parameters: 9 per camera (Rodrigues r[3], translation t[3], f, k1, k2).
+  std::vector<double> cameras;  // [num_cameras * 9]
+
+  // 3D points: 3 per point.
+  std::vector<double> points;  // [num_points * 3]
+
+  const double* camera(int i) const { return &cameras[i * 9]; }
+  const double* point(int i) const { return &points[i * 3]; }
+
+  bool load(const std::string& filename) {
+    std::ifstream in(filename);
+    if (!in) {
+      fprintf(stderr, "ERROR: Cannot open BAL file: %s\n", filename.c_str());
+      return false;
+    }
+
+    in >> num_cameras >> num_points >> num_observations;
+    if (!in || num_cameras <= 0 || num_points <= 0 || num_observations <= 0) {
+      fprintf(stderr, "ERROR: Invalid BAL header in %s\n", filename.c_str());
+      return false;
+    }
+
+    camera_index.resize(num_observations);
+    point_index.resize(num_observations);
+    observations_x.resize(num_observations);
+    observations_y.resize(num_observations);
+    for (int i = 0; i < num_observations; ++i) {
+      in >> camera_index[i] >> point_index[i] >> observations_x[i] >> observations_y[i];
+    }
+
+    cameras.resize(num_cameras * 9);
+    for (int i = 0; i < num_cameras * 9; ++i) {
+      in >> cameras[i];
+    }
+
+    points.resize(num_points * 3);
+    for (int i = 0; i < num_points * 3; ++i) {
+      in >> points[i];
+    }
+
+    if (!in) {
+      fprintf(stderr, "ERROR: Truncated BAL file: %s\n", filename.c_str());
+      return false;
+    }
+
+    fprintf(stderr, "Loaded BAL: %d cameras, %d points, %d observations\n", num_cameras, num_points, num_observations);
+    return true;
+  }
+};
+
+// ============================================================================
+// Camera projection model (BAL convention)
+// ============================================================================
+
+// Rodrigues rotation: rotate point X by axis-angle vector omega.
+static void rodrigues_rotate(const double* omega, const double* X, double* result) {
+  double theta2 = omega[0] * omega[0] + omega[1] * omega[1] + omega[2] * omega[2];
+  if (theta2 > 1e-30) {
+    double theta = std::sqrt(theta2);
+    double costh = std::cos(theta);
+    double sinth = std::sin(theta);
+    double k = (1.0 - costh) / theta2;
+
+    // Cross product omega x X.
+    double wx = omega[1] * X[2] - omega[2] * X[1];
+    double wy = omega[2] * X[0] - omega[0] * X[2];
+    double wz = omega[0] * X[1] - omega[1] * X[0];
+
+    // Dot product omega . X.
+    double dot = omega[0] * X[0] + omega[1] * X[1] + omega[2] * X[2];
+
+    result[0] = X[0] * costh + wx * (sinth / theta) + omega[0] * dot * k;
+    result[1] = X[1] * costh + wy * (sinth / theta) + omega[1] * dot * k;
+    result[2] = X[2] * costh + wz * (sinth / theta) + omega[2] * dot * k;
+  } else {
+    // Small angle: R ≈ I + [omega]×.
+    result[0] = X[0] + omega[1] * X[2] - omega[2] * X[1];
+    result[1] = X[1] + omega[2] * X[0] - omega[0] * X[2];
+    result[2] = X[2] + omega[0] * X[1] - omega[1] * X[0];
+  }
+}
+
+// Project a 3D point through a camera, returning the 2D residual.
+// camera: [r0,r1,r2, t0,t1,t2, f, k1, k2]
+// point:  [X, Y, Z]
+// observed: [ox, oy]
+// residual: [rx, ry] = projected - observed
+static void project(const double* camera, const double* point, const double* observed, double* residual) {
+  // Rotate.
+  double P[3];
+  rodrigues_rotate(camera, point, P);
+
+  // Translate.
+  P[0] += camera[3];
+  P[1] += camera[4];
+  P[2] += camera[5];
+
+  // Normalize (BAL convention: negative z).
+  double xp = -P[0] / P[2];
+  double yp = -P[1] / P[2];
+
+  // Radial distortion.
+  double r2 = xp * xp + yp * yp;
+  double distortion = 1.0 + camera[7] * r2 + camera[8] * r2 * r2;
+
+  // Apply focal length.
+  double predicted_x = camera[6] * distortion * xp;
+  double predicted_y = camera[6] * distortion * yp;
+
+  residual[0] = predicted_x - observed[0];
+  residual[1] = predicted_y - observed[1];
+}
+
+// ============================================================================
+// Jacobian computation (numerical differentiation)
+// ============================================================================
+
+// Compute the 2x9 Jacobian block w.r.t. camera params and 2x3 block w.r.t.
+// point coords for a single observation, using central finite differences.
+static void compute_jacobian_block(const double* camera, const double* point, const double* observed,
+                                   double* J_cam,    // 2x9, row-major
+                                   double* J_point)  // 2x3, row-major
+{
+  constexpr double eps = 1e-8;
+
+  // Camera parameters (9).
+  double cam_pert[9];
+  std::copy(camera, camera + 9, cam_pert);
+  for (int j = 0; j < 9; ++j) {
+    double orig = cam_pert[j];
+    double rp[2], rm[2];
+
+    cam_pert[j] = orig + eps;
+    project(cam_pert, point, observed, rp);
+    cam_pert[j] = orig - eps;
+    project(cam_pert, point, observed, rm);
+    cam_pert[j] = orig;
+
+    J_cam[0 * 9 + j] = (rp[0] - rm[0]) / (2.0 * eps);
+    J_cam[1 * 9 + j] = (rp[1] - rm[1]) / (2.0 * eps);
+  }
+
+  // Point coordinates (3).
+  double pt_pert[3];
+  std::copy(point, point + 3, pt_pert);
+  for (int j = 0; j < 3; ++j) {
+    double orig = pt_pert[j];
+    double rp[2], rm[2];
+
+    pt_pert[j] = orig + eps;
+    project(camera, pt_pert, observed, rp);
+    pt_pert[j] = orig - eps;
+    project(camera, pt_pert, observed, rm);
+    pt_pert[j] = orig;
+
+    J_point[0 * 3 + j] = (rp[0] - rm[0]) / (2.0 * eps);
+    J_point[1 * 3 + j] = (rp[1] - rm[1]) / (2.0 * eps);
+  }
+}
+
+// ============================================================================
+// Build normal equations: H = J^T*J + lambda*I, g = -J^T*r
+// ============================================================================
+
+struct NormalEquations {
+  SparseMatrix<double, ColMajor, int> H;
+  VectorXd g;
+  VectorXd residual;
+  double residual_norm;
+  int jacobian_rows;
+  int jacobian_cols;
+  long jacobian_nnz;
+};
+
+static NormalEquations build_normal_equations(const BALProblem& problem, double lambda = 1.0) {
+  const int num_cam_params = problem.num_cameras * 9;
+  const int num_pt_params = problem.num_points * 3;
+  const int num_params = num_cam_params + num_pt_params;
+  const int num_residuals = problem.num_observations * 2;
+
+  fprintf(stderr, "Building Jacobian: %d x %d, %ld nonzeros\n", num_residuals, num_params,
+          (long)problem.num_observations * 24);
+
+  // Build J as a triplet list.
+  using Triplet = Eigen::Triplet<double>;
+  std::vector<Triplet> triplets;
+  triplets.reserve(problem.num_observations * 24);  // 2 rows × 12 nonzeros = 24 entries per obs
+
+  VectorXd residual(num_residuals);
+
+  for (int obs = 0; obs < problem.num_observations; ++obs) {
+    int ci = problem.camera_index[obs];
+    int pi = problem.point_index[obs];
+    double observed[2] = {problem.observations_x[obs], problem.observations_y[obs]};
+
+    // Compute residual.
+    double r[2];
+    project(problem.camera(ci), problem.point(pi), observed, r);
+    residual[obs * 2 + 0] = r[0];
+    residual[obs * 2 + 1] = r[1];
+
+    // Compute Jacobian blocks.
+    double J_cam[18], J_pt[6];  // 2x9 and 2x3
+    compute_jacobian_block(problem.camera(ci), problem.point(pi), observed, J_cam, J_pt);
+
+    // Insert camera block: rows [2*obs, 2*obs+1], cols [9*ci, 9*ci+8].
+    for (int row = 0; row < 2; ++row) {
+      for (int col = 0; col < 9; ++col) {
+        double val = J_cam[row * 9 + col];
+        if (val != 0.0) {
+          triplets.emplace_back(obs * 2 + row, ci * 9 + col, val);
+        }
+      }
+    }
+
+    // Insert point block: rows [2*obs, 2*obs+1], cols [num_cam_params + 3*pi, ...].
+    for (int row = 0; row < 2; ++row) {
+      for (int col = 0; col < 3; ++col) {
+        double val = J_pt[row * 3 + col];
+        if (val != 0.0) {
+          triplets.emplace_back(obs * 2 + row, num_cam_params + pi * 3 + col, val);
+        }
+      }
+    }
+  }
+
+  // Build sparse Jacobian.
+  SparseMatrix<double, ColMajor, int> J(num_residuals, num_params);
+  J.setFromTriplets(triplets.begin(), triplets.end());
+
+  fprintf(stderr, "Jacobian: %dx%d, nnz=%ld\n", (int)J.rows(), (int)J.cols(), (long)J.nonZeros());
+
+  // Form normal equations: H = J^T*J + lambda*I.
+  SparseMatrix<double, ColMajor, int> H = (J.transpose() * J).pruned();
+
+  // Add Levenberg-Marquardt damping.
+  for (int i = 0; i < num_params; ++i) {
+    H.coeffRef(i, i) += lambda;
+  }
+  H.makeCompressed();
+
+  // Gradient: g = -J^T * r.
+  VectorXd g = -(J.transpose() * residual);
+
+  double rnorm = residual.norm();
+  fprintf(stderr, "Normal equations: H is %dx%d, nnz=%ld, |r|=%.6e\n", (int)H.rows(), (int)H.cols(), (long)H.nonZeros(),
+          rnorm);
+
+  return {std::move(H), std::move(g), std::move(residual), rnorm, num_residuals, num_params, (long)J.nonZeros()};
+}
+
+// ============================================================================
+// Global problem state (loaded once before benchmarks run)
+// ============================================================================
+
+static BALProblem g_problem;
+static NormalEquations g_neq;
+static bool g_loaded = false;
+
+static void ensure_loaded() {
+  if (g_loaded) return;
+
+  const char* bal_file = std::getenv("BAL_FILE");
+  if (!bal_file) {
+    fprintf(stderr,
+            "ERROR: Set BAL_FILE environment variable to a BAL problem file.\n"
+            "  Download from: http://grail.cs.washington.edu/projects/bal/\n"
+            "  Example:\n"
+            "    wget http://grail.cs.washington.edu/projects/bal/data/ladybug/"
+            "problem-49-7776-pre.txt.bz2\n"
+            "    bunzip2 problem-49-7776-pre.txt.bz2\n"
+            "    BAL_FILE=problem-49-7776-pre.txt ./build-bench-gpu/bench_gpu_ba\n");
+    std::exit(1);
+  }
+
+  if (!g_problem.load(bal_file)) {
+    std::exit(1);
+  }
+
+  g_neq = build_normal_equations(g_problem);
+  g_loaded = true;
+}
+
+// ============================================================================
+// CPU CG benchmark
+// ============================================================================
+
+static void BM_BA_CPU_CG(benchmark::State& state) {
+  ensure_loaded();
+  const auto& H = g_neq.H;
+  const auto& g = g_neq.g;
+
+  ConjugateGradient<SparseMatrix<double, ColMajor, int>, Lower | Upper> cg;
+  cg.setMaxIterations(10000);
+  cg.setTolerance(1e-8);
+  cg.compute(H);
+
+  int last_iters = 0;
+  double last_error = 0;
+  for (auto _ : state) {
+    VectorXd dx = cg.solve(g);
+    benchmark::DoNotOptimize(dx.data());
+    last_iters = cg.iterations();
+    last_error = cg.error();
+  }
+
+  state.counters["n"] = H.rows();
+  state.counters["nnz"] = H.nonZeros();
+  state.counters["iters"] = last_iters;
+  state.counters["error"] = last_error;
+  state.counters["cameras"] = g_problem.num_cameras;
+  state.counters["points"] = g_problem.num_points;
+  state.counters["observations"] = g_problem.num_observations;
+}
+
+// ============================================================================
+// GPU CG benchmark (with Jacobi preconditioner)
+// ============================================================================
+
+static void cuda_warmup() {
+  static bool done = false;
+  if (!done) {
+    void* p;
+    cudaMalloc(&p, 1);
+    cudaFree(p);
+    done = true;
+  }
+}
+
+static void BM_BA_GPU_CG(benchmark::State& state) {
+  ensure_loaded();
+  cuda_warmup();
+
+  const auto& H = g_neq.H;
+  const auto& g = g_neq.g;
+  const Index n = H.rows();
+
+  // Extract inverse diagonal (Jacobi preconditioner).
+  using SpMat = SparseMatrix<double, ColMajor, int>;
+  VectorXd invdiag(n);
+  for (Index j = 0; j < H.outerSize(); ++j) {
+    SpMat::InnerIterator it(H, j);
+    while (it && it.index() != j) ++it;
+    if (it && it.index() == j && it.value() != 0.0)
+      invdiag(j) = 1.0 / it.value();
+    else
+      invdiag(j) = 1.0;
+  }
+
+  // Set up GPU context and upload data.
+  GpuContext ctx;
+  GpuContext::setThreadLocal(&ctx);
+  GpuSparseContext<double> spmv_ctx(ctx);
+  auto mat = spmv_ctx.deviceView(H);
+  auto d_invdiag = DeviceMatrix<double>::fromHost(invdiag, ctx.stream());
+  auto d_g = DeviceMatrix<double>::fromHost(g, ctx.stream());
+
+  int last_iters = 0;
+  double last_error = 0;
+
+  for (auto _ : state) {
+    DeviceMatrix<double> d_x(n, 1);
+    d_x.setZero(ctx);
+    DeviceMatrix<double> residual(n, 1);
+    residual.copyFrom(ctx, d_g);
+
+    double rhsNorm2 = d_g.squaredNorm(ctx);
+    double threshold = 1e-8 * 1e-8 * rhsNorm2;
+    double residualNorm2 = residual.squaredNorm(ctx);
+
+    DeviceMatrix<double> p = d_invdiag.cwiseProduct(ctx, residual);
+    DeviceMatrix<double> z(n, 1), tmp(n, 1);
+
+    auto absNew = residual.dot(ctx, p);
+    Index i = 0;
+    Index maxIters = 10000;
+    while (i < maxIters) {
+      tmp.noalias() = mat * p;
+      auto alpha = absNew / p.dot(ctx, tmp);
+      d_x += alpha * p;
+      residual -= alpha * tmp;
+
+      residualNorm2 = residual.squaredNorm(ctx);
+      if (residualNorm2 < threshold) break;
+
+      z.cwiseProduct(ctx, d_invdiag, residual);  // in-place, no allocation
+      auto absOld = std::move(absNew);
+      absNew = residual.dot(ctx, z);
+      auto beta = absNew / absOld;
+
+      p *= beta;  // device-pointer scal, no host sync
+      p += z;
+      i++;
+    }
+    benchmark::DoNotOptimize(d_x.data());
+    last_iters = i;
+    last_error = std::sqrt(residualNorm2 / rhsNorm2);
+  }
+
+  GpuContext::setThreadLocal(nullptr);
+
+  state.counters["n"] = n;
+  state.counters["nnz"] = H.nonZeros();
+  state.counters["iters"] = last_iters;
+  state.counters["error"] = last_error;
+  state.counters["cameras"] = g_problem.num_cameras;
+  state.counters["points"] = g_problem.num_points;
+  state.counters["observations"] = g_problem.num_observations;
+}
+
+// ============================================================================
+// CPU CG with Jacobi preconditioner (apples-to-apples comparison)
+// ============================================================================
+
+static void BM_BA_CPU_CG_Jacobi(benchmark::State& state) {
+  ensure_loaded();
+  const auto& H = g_neq.H;
+  const auto& g = g_neq.g;
+
+  // Eigen's DiagonalPreconditioner is effectively Jacobi.
+  ConjugateGradient<SparseMatrix<double, ColMajor, int>, Lower | Upper> cg;
+  cg.setMaxIterations(10000);
+  cg.setTolerance(1e-8);
+  cg.compute(H);
+
+  int last_iters = 0;
+  double last_error = 0;
+  for (auto _ : state) {
+    VectorXd dx = cg.solve(g);
+    benchmark::DoNotOptimize(dx.data());
+    last_iters = cg.iterations();
+    last_error = cg.error();
+  }
+
+  state.counters["n"] = H.rows();
+  state.counters["nnz"] = H.nonZeros();
+  state.counters["iters"] = last_iters;
+  state.counters["error"] = last_error;
+}
+
+// ============================================================================
+// Register benchmarks
+// ============================================================================
+
+BENCHMARK(BM_BA_CPU_CG)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_BA_CPU_CG_Jacobi)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_BA_GPU_CG)->Unit(benchmark::kMillisecond);
+
+// ============================================================================
+// Custom main: print summary after benchmarks
+// ============================================================================
+
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+
+  // Print problem info before benchmarks.
+  const char* bal_file = std::getenv("BAL_FILE");
+  if (bal_file) {
+    ensure_loaded();
+    fprintf(stderr,
+            "\n"
+            "=== Bundle Adjustment GPU CG Benchmark ===\n"
+            "BAL file:      %s\n"
+            "Cameras:       %d\n"
+            "Points:        %d\n"
+            "Observations:  %d\n"
+            "J size:        %d x %d, nnz=%ld\n"
+            "H size:        %d x %d, nnz=%ld\n"
+            "|residual|:    %.6e\n"
+            "==========================================\n\n",
+            bal_file, g_problem.num_cameras, g_problem.num_points, g_problem.num_observations, g_neq.jacobian_rows,
+            g_neq.jacobian_cols, g_neq.jacobian_nnz, (int)g_neq.H.rows(), (int)g_neq.H.cols(), (long)g_neq.H.nonZeros(),
+            g_neq.residual_norm);
+  }
+
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  return 0;
+}
--- a/benchmarks/GPU/bench_gpu_cg_sync.cu
+++ b/benchmarks/GPU/bench_gpu_cg_sync.cu
@@ -0,0 +1,291 @@
+// Benchmark: GPU Conjugate Gradient via DeviceMatrix operators.
+//
+// Shows the path to running Eigen's CG on GPU with minimal code changes.
+// The DeviceMatrix benchmark mirrors Eigen's conjugate_gradient() line-by-line.
+// A raw cuBLAS device-pointer-mode implementation is included as a lower bound.
+//
+// The only change needed in Eigen's CG template to support DeviceMatrix:
+//   Line 34:  typedef Dest VectorType;  (instead of Matrix<Scalar, Dynamic, 1>)
+//
+// Usage:
+//   cmake --build build-bench-gpu --target bench_gpu_cg_sync
+//   ./build-bench-gpu/bench_gpu_cg_sync
+
+#include <benchmark/benchmark.h>
+
+#include <Eigen/Sparse>
+#include <Eigen/GPU>
+#include <cusparse.h>
+
+using namespace Eigen;
+
+using Scalar = double;
+using RealScalar = double;
+using Vec = Matrix<Scalar, Dynamic, 1>;
+using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+
+static SpMat make_spd(Index n) {
+  SpMat A(n, n);
+  A.reserve(VectorXi::Constant(n, 3));
+  for (Index i = 0; i < n; ++i) {
+    A.insert(i, i) = 4.0;
+    if (i > 0) A.insert(i, i - 1) = -1.0;
+    if (i < n - 1) A.insert(i, i + 1) = -1.0;
+  }
+  A.makeCompressed();
+  return A;
+}
+
+static void cuda_warmup() {
+  static bool done = false;
+  if (!done) {
+    void* p;
+    cudaMalloc(&p, 1);
+    cudaFree(p);
+    done = true;
+  }
+}
+
+// ==========================================================================
+// GPU CG using DeviceMatrix operators — mirrors Eigen's conjugate_gradient()
+// ==========================================================================
+//
+// Compare with Eigen/src/IterativeLinearSolvers/ConjugateGradient.h lines 29-84.
+// Left column: Eigen CG code.  Right column: this benchmark.
+//
+//   Eigen CG                              GPU CG (this benchmark)
+//   --------                              -----------------------
+//   VectorType residual = rhs - mat * x;  residual.copyFrom(ctx, rhs);  [x=0 so r=b]
+//   RealScalar rhsNorm2 = rhs.sqNorm();   RealScalar rhsNorm2 = rhs.squaredNorm();
+//   ...
+//   tmp.noalias() = mat * p;              tmp.noalias() = mat * p;  [identical]
+//   Scalar alpha = absNew / p.dot(tmp);   Scalar alpha = absNew / p.dot(tmp);  [identical]
+//   x += alpha * p;                       x += alpha * p;  [identical]
+//   residual -= alpha * tmp;              residual -= alpha * tmp;  [identical]
+//   residualNorm2 = residual.sqNorm();    residualNorm2 = residual.squaredNorm();  [identical]
+//   ...
+//   p = z + beta * p;                     p *= beta; p += z;  [equivalent, no alloc]
+
+static void BM_CG_DeviceMatrixOps(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+
+  SpMat A = make_spd(n);
+  Vec b = Vec::Random(n);
+
+  // One shared context: SpMV + BLAS-1 on same stream, zero event overhead.
+  GpuContext ctx;
+  GpuContext::setThreadLocal(&ctx);
+  GpuSparseContext<Scalar> spmv(ctx);
+  auto mat = spmv.deviceView(A);
+
+  // Upload RHS once.
+  auto rhs = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
+
+  for (auto _ : state) {
+    // --- Eigen CG lines 34-63: initialization ---
+    //   typedef Dest VectorType;                               // GPU CHANGE: was Matrix<Scalar,Dynamic,1>
+    //   VectorType residual = rhs - mat * x;                   // x=0, so residual = rhs
+    DeviceMatrix<Scalar> x(n, 1);
+    x.setZero();
+    DeviceMatrix<Scalar> residual(n, 1);
+    residual.copyFrom(ctx, rhs);
+
+    //   RealScalar rhsNorm2 = rhs.squaredNorm();
+    RealScalar rhsNorm2 = rhs.squaredNorm();
+    if (rhsNorm2 == 0) continue;
+
+    RealScalar tol = 1e-10;
+    const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+    RealScalar threshold = numext::maxi(RealScalar(tol * tol * rhsNorm2), considerAsZero);
+
+    //   RealScalar residualNorm2 = residual.squaredNorm();
+    RealScalar residualNorm2 = residual.squaredNorm();
+    if (residualNorm2 < threshold) continue;
+
+    //   VectorType p(n);
+    //   p = precond.solve(residual);                           // no preconditioner: p = residual
+    DeviceMatrix<Scalar> p(n, 1);
+    p.copyFrom(ctx, residual);
+
+    //   VectorType z(n), tmp(n);
+    DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
+
+    //   auto absNew = numext::real(residual.dot(p));
+    //   DeviceScalar — stays on device, no sync.
+    auto absNew = residual.dot(p);  // DeviceScalar, no sync
+
+    //   while (i < maxIters) {
+    Index maxIters = 200;
+    Index i = 0;
+    while (i < maxIters) {
+      //     tmp.noalias() = mat * p;
+      tmp.noalias() = mat * p;  // SpMV, device-resident
+
+      //     auto alpha = absNew / p.dot(tmp);
+      //   DeviceScalar / DeviceScalar → device kernel, no sync!
+      auto alpha = absNew / p.dot(tmp);  // DeviceScalar, no sync
+
+      //     x += alpha * p;
+      //   DeviceScalar * DeviceMatrix → device-pointer axpy, no sync!
+      x += alpha * p;
+
+      //     residual -= alpha * tmp;
+      residual -= alpha * tmp;  // device-pointer axpy, no sync
+
+      //     residualNorm2 = residual.squaredNorm();
+      residualNorm2 = residual.squaredNorm();  // THE one sync per iteration
+
+      //     if (residualNorm2 < threshold) break;
+      if (residualNorm2 < threshold) break;
+
+      //     z = precond.solve(residual);
+      z.copyFrom(ctx, residual);  // no preconditioner
+
+      //     auto absOld = std::move(absNew);
+      auto absOld = std::move(absNew);  // no sync, no alloc
+
+      //     absNew = numext::real(residual.dot(z));
+      absNew = residual.dot(z);  // DeviceScalar, no sync
+
+      //     auto beta = absNew / absOld;
+      //   DeviceScalar / DeviceScalar → device kernel, no sync!
+      auto beta = absNew / absOld;  // DeviceScalar, no sync
+
+      //     p = z + beta * p;
+      p *= beta;  // device-pointer scal, no host sync
+      p += z;
+
+      i++;
+    }
+  }
+
+  GpuContext::setThreadLocal(nullptr);
+  state.SetItemsProcessed(state.iterations() * 200);
+}
+
+BENCHMARK(BM_CG_DeviceMatrixOps)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
+
+// ==========================================================================
+// Raw cuBLAS device-pointer-mode CG (1 sync/iter) — performance lower bound
+// ==========================================================================
+
+__global__ void scalar_div_kernel(const Scalar* a, const Scalar* b, Scalar* out) { *out = *a / *b; }
+__global__ void scalar_neg_kernel(const Scalar* in, Scalar* out) { *out = -(*in); }
+
+static void BM_CG_DevicePointerMode(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int maxIters = 200;
+
+  SpMat A = make_spd(n);
+  Vec b = Vec::Random(n);
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+  cublasHandle_t cublas;
+  cublasCreate(&cublas);
+  cublasSetStream(cublas, stream);
+
+  cusparseHandle_t cusparse;
+  cusparseCreate(&cusparse);
+  cusparseSetStream(cusparse, stream);
+
+  internal::DeviceBuffer d_outer((n + 1) * sizeof(int));
+  internal::DeviceBuffer d_inner(A.nonZeros() * sizeof(int));
+  internal::DeviceBuffer d_vals(A.nonZeros() * sizeof(Scalar));
+  cudaMemcpy(d_outer.ptr, A.outerIndexPtr(), (n + 1) * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_inner.ptr, A.innerIndexPtr(), A.nonZeros() * sizeof(int), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_vals.ptr, A.valuePtr(), A.nonZeros() * sizeof(Scalar), cudaMemcpyHostToDevice);
+
+  cusparseSpMatDescr_t matA;
+  cusparseCreateCsc(&matA, n, n, A.nonZeros(), d_outer.ptr, d_inner.ptr, d_vals.ptr, CUSPARSE_INDEX_32I,
+                    CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
+
+  internal::DeviceBuffer d_tmp_buf(n * sizeof(Scalar));
+  cusparseDnVecDescr_t tmp_x, tmp_y;
+  cusparseCreateDnVec(&tmp_x, n, d_tmp_buf.ptr, CUDA_R_64F);
+  cusparseCreateDnVec(&tmp_y, n, d_tmp_buf.ptr, CUDA_R_64F);
+  Scalar spmv_alpha = 1.0, spmv_beta = 0.0;
+  size_t ws_size = 0;
+  cusparseSpMV_bufferSize(cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &spmv_alpha, matA, tmp_x, &spmv_beta, tmp_y,
+                          CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &ws_size);
+  internal::DeviceBuffer d_workspace(ws_size);
+  cusparseDestroyDnVec(tmp_x);
+  cusparseDestroyDnVec(tmp_y);
+
+  internal::DeviceBuffer d_x(n * sizeof(Scalar)), d_r(n * sizeof(Scalar));
+  internal::DeviceBuffer d_p(n * sizeof(Scalar)), d_tmp(n * sizeof(Scalar));
+  internal::DeviceBuffer d_b(n * sizeof(Scalar));
+  internal::DeviceBuffer d_absNew(sizeof(Scalar)), d_absOld(sizeof(Scalar));
+  internal::DeviceBuffer d_pdot(sizeof(Scalar)), d_alpha(sizeof(Scalar));
+  internal::DeviceBuffer d_neg_alpha(sizeof(Scalar)), d_beta(sizeof(Scalar));
+  internal::DeviceBuffer d_rnorm(sizeof(RealScalar));
+
+  cudaMemcpy(d_b.ptr, b.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice);
+
+  auto spmv = [&](Scalar* x_ptr, Scalar* y_ptr) {
+    cusparseDnVecDescr_t vx, vy;
+    cusparseCreateDnVec(&vx, n, x_ptr, CUDA_R_64F);
+    cusparseCreateDnVec(&vy, n, y_ptr, CUDA_R_64F);
+    cusparseSpMV(cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &spmv_alpha, matA, vx, &spmv_beta, vy, CUDA_R_64F,
+                 CUSPARSE_SPMV_ALG_DEFAULT, d_workspace.ptr);
+    cusparseDestroyDnVec(vx);
+    cusparseDestroyDnVec(vy);
+  };
+
+  for (auto _ : state) {
+    cudaMemsetAsync(static_cast<Scalar*>(d_x.ptr), 0, n * sizeof(Scalar), stream);
+    cudaMemcpyAsync(d_r.ptr, d_b.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
+    cudaMemcpyAsync(d_p.ptr, d_b.ptr, n * sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
+
+    cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_DEVICE);
+    cublasDdot(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_p.ptr), 1,
+               static_cast<Scalar*>(d_absNew.ptr));
+
+    for (int i = 0; i < maxIters; ++i) {
+      spmv(static_cast<Scalar*>(d_p.ptr), static_cast<Scalar*>(d_tmp.ptr));
+
+      cublasDdot(cublas, n, static_cast<Scalar*>(d_p.ptr), 1, static_cast<Scalar*>(d_tmp.ptr), 1,
+                 static_cast<Scalar*>(d_pdot.ptr));
+
+      scalar_div_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_absNew.ptr), static_cast<Scalar*>(d_pdot.ptr),
+                                             static_cast<Scalar*>(d_alpha.ptr));
+      scalar_neg_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_alpha.ptr), static_cast<Scalar*>(d_neg_alpha.ptr));
+
+      cublasDaxpy(cublas, n, static_cast<Scalar*>(d_alpha.ptr), static_cast<Scalar*>(d_p.ptr), 1,
+                  static_cast<Scalar*>(d_x.ptr), 1);
+      cublasDaxpy(cublas, n, static_cast<Scalar*>(d_neg_alpha.ptr), static_cast<Scalar*>(d_tmp.ptr), 1,
+                  static_cast<Scalar*>(d_r.ptr), 1);
+
+      cublasDnrm2(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<RealScalar*>(d_rnorm.ptr));
+
+      RealScalar rnorm;
+      cudaMemcpyAsync(&rnorm, d_rnorm.ptr, sizeof(RealScalar), cudaMemcpyDeviceToHost, stream);
+      cudaStreamSynchronize(stream);
+      if (rnorm * rnorm < 1e-20) break;
+
+      cudaMemcpyAsync(d_absOld.ptr, d_absNew.ptr, sizeof(Scalar), cudaMemcpyDeviceToDevice, stream);
+      cublasDdot(cublas, n, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_r.ptr), 1,
+                 static_cast<Scalar*>(d_absNew.ptr));
+
+      scalar_div_kernel<<<1, 1, 0, stream>>>(static_cast<Scalar*>(d_absNew.ptr), static_cast<Scalar*>(d_absOld.ptr),
+                                             static_cast<Scalar*>(d_beta.ptr));
+
+      cublasDscal(cublas, n, static_cast<Scalar*>(d_beta.ptr), static_cast<Scalar*>(d_p.ptr), 1);
+      cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_HOST);
+      Scalar one = 1.0;
+      cublasDaxpy(cublas, n, &one, static_cast<Scalar*>(d_r.ptr), 1, static_cast<Scalar*>(d_p.ptr), 1);
+      cublasSetPointerMode(cublas, CUBLAS_POINTER_MODE_DEVICE);
+    }
+    cudaStreamSynchronize(stream);
+  }
+
+  state.SetItemsProcessed(state.iterations() * maxIters);
+  cusparseDestroySpMat(matA);
+  cusparseDestroy(cusparse);
+  cublasDestroy(cublas);
+  cudaStreamDestroy(stream);
+}
+
+BENCHMARK(BM_CG_DevicePointerMode)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
--- a/benchmarks/GPU/bench_gpu_cg_vs_cpu.cu
+++ b/benchmarks/GPU/bench_gpu_cg_vs_cpu.cu
@@ -0,0 +1,216 @@
+// Benchmark: GPU CG vs CPU CG on realistic sparse systems.
+//
+// Tests 2D Laplacian (5-point stencil) and 3D Laplacian (7-point stencil)
+// in both float and double precision.
+//
+// Usage:
+//   cmake --build build-bench-gpu --target bench_gpu_cg_vs_cpu
+//   ./build-bench-gpu/bench_gpu_cg_vs_cpu
+
+#include <benchmark/benchmark.h>
+
+#include <Eigen/Sparse>
+#include <Eigen/IterativeLinearSolvers>
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Sparse matrix generators -----------------------------------------------
+
+template <typename Scalar>
+SparseMatrix<Scalar, ColMajor, int> make_laplacian_2d(int grid_n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  const int n = grid_n * grid_n;
+  SpMat A(n, n);
+  A.reserve(VectorXi::Constant(n, 5));
+  for (int i = 0; i < grid_n; ++i) {
+    for (int j = 0; j < grid_n; ++j) {
+      int idx = i * grid_n + j;
+      A.insert(idx, idx) = Scalar(4);
+      if (i > 0) A.insert(idx, idx - grid_n) = Scalar(-1);
+      if (i < grid_n - 1) A.insert(idx, idx + grid_n) = Scalar(-1);
+      if (j > 0) A.insert(idx, idx - 1) = Scalar(-1);
+      if (j < grid_n - 1) A.insert(idx, idx + 1) = Scalar(-1);
+    }
+  }
+  A.makeCompressed();
+  return A;
+}
+
+template <typename Scalar>
+SparseMatrix<Scalar, ColMajor, int> make_laplacian_3d(int grid_n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  const int n = grid_n * grid_n * grid_n;
+  const int n2 = grid_n * grid_n;
+  SpMat A(n, n);
+  A.reserve(VectorXi::Constant(n, 7));
+  for (int i = 0; i < grid_n; ++i) {
+    for (int j = 0; j < grid_n; ++j) {
+      for (int k = 0; k < grid_n; ++k) {
+        int idx = i * n2 + j * grid_n + k;
+        A.insert(idx, idx) = Scalar(6);
+        if (i > 0) A.insert(idx, idx - n2) = Scalar(-1);
+        if (i < grid_n - 1) A.insert(idx, idx + n2) = Scalar(-1);
+        if (j > 0) A.insert(idx, idx - grid_n) = Scalar(-1);
+        if (j < grid_n - 1) A.insert(idx, idx + grid_n) = Scalar(-1);
+        if (k > 0) A.insert(idx, idx - 1) = Scalar(-1);
+        if (k < grid_n - 1) A.insert(idx, idx + 1) = Scalar(-1);
+      }
+    }
+  }
+  A.makeCompressed();
+  return A;
+}
+
+static void cuda_warmup() {
+  static bool done = false;
+  if (!done) {
+    void* p;
+    cudaMalloc(&p, 1);
+    cudaFree(p);
+    done = true;
+  }
+}
+
+// ---- CPU CG -----------------------------------------------------------------
+
+template <typename Scalar, typename MatGen>
+void run_cpu_cg(benchmark::State& state, MatGen make_matrix) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  const int grid_n = state.range(0);
+  SpMat A = make_matrix(grid_n);
+  Vec b = Vec::Random(A.rows());
+
+  ConjugateGradient<SpMat, Lower | Upper> cg;
+  cg.setMaxIterations(10000);
+  cg.setTolerance(RealScalar(1e-8));
+  cg.compute(A);
+
+  int last_iters = 0;
+  for (auto _ : state) {
+    Vec x = cg.solve(b);
+    benchmark::DoNotOptimize(x.data());
+    last_iters = cg.iterations();
+  }
+  state.counters["n"] = A.rows();
+  state.counters["nnz"] = A.nonZeros();
+  state.counters["iters"] = last_iters;
+  state.counters["error"] = cg.error();
+}
+
+// ---- GPU CG -----------------------------------------------------------------
+
+template <typename Scalar, typename MatGen>
+void run_gpu_cg(benchmark::State& state, MatGen make_matrix) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  cuda_warmup();
+  const int grid_n = state.range(0);
+  SpMat A = make_matrix(grid_n);
+  const Index n = A.rows();
+  Vec b = Vec::Random(n);
+
+  // Extract inverse diagonal.
+  Vec invdiag(n);
+  for (Index j = 0; j < A.outerSize(); ++j) {
+    typename SpMat::InnerIterator it(A, j);
+    while (it && it.index() != j) ++it;
+    if (it && it.index() == j && it.value() != Scalar(0))
+      invdiag(j) = Scalar(1) / it.value();
+    else
+      invdiag(j) = Scalar(1);
+  }
+
+  GpuContext ctx;
+  GpuContext::setThreadLocal(&ctx);
+  GpuSparseContext<Scalar> spmv_ctx(ctx);
+  auto mat = spmv_ctx.deviceView(A);
+  auto d_invdiag = DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());
+  auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
+
+  int last_iters = 0;
+  RealScalar last_error = 0;
+
+  for (auto _ : state) {
+    DeviceMatrix<Scalar> d_x(n, 1);
+    d_x.setZero(ctx);
+    DeviceMatrix<Scalar> residual(n, 1);
+    residual.copyFrom(ctx, d_b);
+
+    RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
+    RealScalar tol = RealScalar(1e-8);
+    RealScalar threshold = tol * tol * rhsNorm2;
+    RealScalar residualNorm2 = residual.squaredNorm(ctx);
+
+    DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
+    DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
+
+    auto absNew = residual.dot(ctx, p);
+    Index i = 0;
+    Index maxIters = 10000;
+    while (i < maxIters) {
+      tmp.noalias() = mat * p;
+      auto alpha = absNew / p.dot(ctx, tmp);
+      d_x += alpha * p;
+      residual -= alpha * tmp;
+
+      residualNorm2 = residual.squaredNorm(ctx);
+      if (residualNorm2 < threshold) break;
+
+      z.cwiseProduct(ctx, d_invdiag, residual);
+      auto absOld = std::move(absNew);
+      absNew = residual.dot(ctx, z);
+      auto beta = absNew / absOld;
+
+      p *= beta;
+      p += z;
+      i++;
+    }
+    benchmark::DoNotOptimize(d_x.data());
+    last_iters = i;
+    last_error = numext::sqrt(residualNorm2 / rhsNorm2);
+  }
+
+  GpuContext::setThreadLocal(nullptr);
+  state.counters["n"] = n;
+  state.counters["nnz"] = A.nonZeros();
+  state.counters["iters"] = last_iters;
+  state.counters["error"] = last_error;
+}
+
+// ---- 2D Laplacian, double ---------------------------------------------------
+
+static void BM_CG_CPU_2D_double(benchmark::State& state) { run_cpu_cg<double>(state, make_laplacian_2d<double>); }
+static void BM_CG_GPU_2D_double(benchmark::State& state) { run_gpu_cg<double>(state, make_laplacian_2d<double>); }
+
+BENCHMARK(BM_CG_CPU_2D_double)->ArgsProduct({{32, 64, 128, 256, 512}});
+BENCHMARK(BM_CG_GPU_2D_double)->ArgsProduct({{32, 64, 128, 256, 512}});
+
+// ---- 2D Laplacian, float ----------------------------------------------------
+
+static void BM_CG_CPU_2D_float(benchmark::State& state) { run_cpu_cg<float>(state, make_laplacian_2d<float>); }
+static void BM_CG_GPU_2D_float(benchmark::State& state) { run_gpu_cg<float>(state, make_laplacian_2d<float>); }
+
+BENCHMARK(BM_CG_CPU_2D_float)->ArgsProduct({{32, 64, 128, 256, 512}});
+BENCHMARK(BM_CG_GPU_2D_float)->ArgsProduct({{32, 64, 128, 256, 512}});
+
+// ---- 3D Laplacian, double ---------------------------------------------------
+
+static void BM_CG_CPU_3D_double(benchmark::State& state) { run_cpu_cg<double>(state, make_laplacian_3d<double>); }
+static void BM_CG_GPU_3D_double(benchmark::State& state) { run_gpu_cg<double>(state, make_laplacian_3d<double>); }
+
+BENCHMARK(BM_CG_CPU_3D_double)->ArgsProduct({{16, 32, 48, 64}});
+BENCHMARK(BM_CG_GPU_3D_double)->ArgsProduct({{16, 32, 48, 64}});
+
+// ---- 3D Laplacian, float ----------------------------------------------------
+
+static void BM_CG_CPU_3D_float(benchmark::State& state) { run_cpu_cg<float>(state, make_laplacian_3d<float>); }
+static void BM_CG_GPU_3D_float(benchmark::State& state) { run_gpu_cg<float>(state, make_laplacian_3d<float>); }
+
+BENCHMARK(BM_CG_CPU_3D_float)->ArgsProduct({{16, 32, 48, 64}});
+BENCHMARK(BM_CG_GPU_3D_float)->ArgsProduct({{16, 32, 48, 64}});
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -481,13 +481,13 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
  ei_add_test(gpu_basic)
  ei_add_test(gpu_library_example "" "CUDA::cusolver")

-  # DeviceMatrix tests: only CUDA runtime, no NVIDIA libraries.
+  # DeviceMatrix tests: CUDA runtime + cuBLAS + cuSOLVER (for BLAS-1 ops via GpuContext).
  unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
  add_executable(gpu_device_matrix gpu_device_matrix.cpp)
  target_include_directories(gpu_device_matrix PRIVATE
    "${CUDA_TOOLKIT_ROOT_DIR}/include"
    "${CMAKE_CURRENT_BINARY_DIR}")
-  target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart)
+  target_link_libraries(gpu_device_matrix Eigen3::Eigen CUDA::cudart CUDA::cublas CUDA::cusolver CUDA::npps CUDA::nppc)
  target_compile_definitions(gpu_device_matrix PRIVATE
    EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
    EIGEN_TEST_PART_ALL=1)
@@ -575,7 +575,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
      "${CUDA_TOOLKIT_ROOT_DIR}/include"
      "${CMAKE_CURRENT_BINARY_DIR}")
    target_link_libraries(gpu_cusparse_spmv
-      Eigen3::Eigen CUDA::cudart CUDA::cusparse)
+      Eigen3::Eigen CUDA::cudart CUDA::cusparse CUDA::cublas CUDA::cusolver)
    target_compile_definitions(gpu_cusparse_spmv PRIVATE
      EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
      EIGEN_TEST_PART_ALL=1)
@@ -584,6 +584,23 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
    add_dependencies(buildtests_gpu gpu_cusparse_spmv)
    set_property(TEST gpu_cusparse_spmv APPEND PROPERTY LABELS "Official;gpu")
    set_property(TEST gpu_cusparse_spmv PROPERTY SKIP_RETURN_CODE 77)
+
+    # End-to-end GPU CG test: Eigen's ConjugateGradient with DeviceMatrix.
+    add_executable(gpu_cg gpu_cg.cpp)
+    target_include_directories(gpu_cg PRIVATE
+      "${CUDA_TOOLKIT_ROOT_DIR}/include"
+      "${CMAKE_CURRENT_BINARY_DIR}")
+    target_link_libraries(gpu_cg
+      Eigen3::Eigen CUDA::cudart CUDA::cusparse CUDA::cublas CUDA::cusolver CUDA::npps CUDA::nppc)
+    target_compile_definitions(gpu_cg PRIVATE
+      EIGEN_TEST_MAX_SIZE=${EIGEN_TEST_MAX_SIZE}
+      EIGEN_TEST_PART_ALL=1)
+    add_test(NAME gpu_cg COMMAND gpu_cg)
+    add_dependencies(buildtests gpu_cg)
+    add_dependencies(buildtests_gpu gpu_cg)
+    set_property(TEST gpu_cg APPEND PROPERTY LABELS "Official;gpu")
+    set_property(TEST gpu_cg PROPERTY SKIP_RETURN_CODE 77)
+
    set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
  endif()

--- a/test/gpu_cg.cpp
+++ b/test/gpu_cg.cpp
@@ -0,0 +1,224 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// End-to-end test: CG algorithm running on GPU via DeviceMatrix.
+//
+// Uses DeviceSparseView for SpMV, DeviceMatrix for vectors, DeviceScalar
+// for deferred reductions. Verifies correctness against CPU ConjugateGradient.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Sparse>
+#include <Eigen/IterativeLinearSolvers>
+#include <Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Helper: build a sparse SPD matrix --------------------------------------
+
+template <typename Scalar>
+SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat R(n, n);
+  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
+  for (Index j = 0; j < n; ++j) {
+    for (Index i = 0; i < n; ++i) {
+      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
+        R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
+      }
+    }
+  }
+  R.makeCompressed();
+  SpMat A = R.adjoint() * R;
+  for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
+  A.makeCompressed();
+  return A;
+}
+
+// ---- GPU CG without preconditioner ------------------------------------------
+
+template <typename Scalar>
+void test_gpu_cg(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_spd<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  // CPU reference (identity preconditioner to match GPU).
+  ConjugateGradient<SpMat, Lower | Upper, IdentityPreconditioner> cpu_cg;
+  cpu_cg.setMaxIterations(1000);
+  cpu_cg.setTolerance(RealScalar(1e-8));
+  cpu_cg.compute(A);
+  Vec x_cpu = cpu_cg.solve(b);
+  VERIFY_IS_EQUAL(cpu_cg.info(), Success);
+
+  // GPU CG: mirrors Eigen's conjugate_gradient() using DeviceMatrix ops.
+  GpuContext ctx;
+  GpuContext::setThreadLocal(&ctx);
+  GpuSparseContext<Scalar> spmv_ctx(ctx);
+  auto mat = spmv_ctx.deviceView(A);
+
+  auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
+  DeviceMatrix<Scalar> d_x(n, 1);
+  d_x.setZero(ctx);
+
+  // r = b (since x=0)
+  DeviceMatrix<Scalar> residual(n, 1);
+  residual.copyFrom(ctx, d_b);
+
+  RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
+  RealScalar tol = RealScalar(1e-8);
+  RealScalar threshold = tol * tol * rhsNorm2;
+  RealScalar residualNorm2 = residual.squaredNorm(ctx);
+
+  // p = r (no preconditioner)
+  DeviceMatrix<Scalar> p(n, 1);
+  p.copyFrom(ctx, residual);
+  DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
+
+  auto absNew = residual.dot(ctx, p);
+  Index maxIters = 1000;
+  Index i = 0;
+  while (i < maxIters) {
+    tmp.noalias() = mat * p;
+
+    auto alpha = absNew / p.dot(ctx, tmp);
+    d_x += alpha * p;
+    residual -= alpha * tmp;
+
+    residualNorm2 = residual.squaredNorm(ctx);
+    if (residualNorm2 < threshold) break;
+
+    // z = r (no preconditioner)
+    z.copyFrom(ctx, residual);
+
+    auto absOld = std::move(absNew);
+    absNew = residual.dot(ctx, z);
+    auto beta = absNew / absOld;
+
+    p *= Scalar(beta);
+    p += z;
+    i++;
+  }
+
+  GpuContext::setThreadLocal(nullptr);
+
+  Vec x_gpu = d_x.toHost(ctx.stream());
+
+  // Verify residual.
+  Vec r = A * x_gpu - b;
+  RealScalar relres = r.norm() / b.norm();
+  VERIFY(relres < RealScalar(1e-6));
+
+  // Compare with CPU.
+  RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
+}
+
+// ---- GPU CG with Jacobi preconditioner --------------------------------------
+
+template <typename Scalar>
+void test_gpu_cg_jacobi(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_spd<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  // CPU reference.
+  ConjugateGradient<SpMat, Lower | Upper> cpu_cg;
+  cpu_cg.setMaxIterations(1000);
+  cpu_cg.setTolerance(RealScalar(1e-8));
+  cpu_cg.compute(A);
+  Vec x_cpu = cpu_cg.solve(b);
+
+  // Extract inverse diagonal.
+  Vec invdiag(n);
+  for (Index j = 0; j < A.outerSize(); ++j) {
+    typename SpMat::InnerIterator it(A, j);
+    while (it && it.index() != j) ++it;
+    if (it && it.index() == j && it.value() != Scalar(0))
+      invdiag(j) = Scalar(1) / it.value();
+    else
+      invdiag(j) = Scalar(1);
+  }
+
+  // GPU CG with Jacobi preconditioner.
+  GpuContext ctx;
+  GpuContext::setThreadLocal(&ctx);
+  GpuSparseContext<Scalar> spmv_ctx(ctx);
+  auto mat = spmv_ctx.deviceView(A);
+  auto d_invdiag = DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());
+
+  auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
+  DeviceMatrix<Scalar> d_x(n, 1);
+  d_x.setZero(ctx);
+
+  DeviceMatrix<Scalar> residual(n, 1);
+  residual.copyFrom(ctx, d_b);
+
+  RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
+  RealScalar tol = RealScalar(1e-8);
+  RealScalar threshold = tol * tol * rhsNorm2;
+  RealScalar residualNorm2 = residual.squaredNorm(ctx);
+
+  // p = precond.solve(r) = invdiag .* r
+  DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
+  DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);
+
+  auto absNew = residual.dot(ctx, p);
+  Index maxIters = 1000;
+  Index i = 0;
+  while (i < maxIters) {
+    tmp.noalias() = mat * p;
+
+    auto alpha = absNew / p.dot(ctx, tmp);
+    d_x += alpha * p;
+    residual -= alpha * tmp;
+
+    residualNorm2 = residual.squaredNorm(ctx);
+    if (residualNorm2 < threshold) break;
+
+    // z = precond.solve(r) = invdiag .* r
+    z.cwiseProduct(ctx, d_invdiag, residual);
+
+    auto absOld = std::move(absNew);
+    absNew = residual.dot(ctx, z);
+    auto beta = absNew / absOld;
+
+    p *= beta;
+    p += z;
+    i++;
+  }
+
+  GpuContext::setThreadLocal(nullptr);
+
+  Vec x_gpu = d_x.toHost(ctx.stream());
+
+  Vec r = A * x_gpu - b;
+  RealScalar relres = r.norm() / b.norm();
+  VERIFY(relres < RealScalar(1e-6));
+
+  RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
+}
+
+EIGEN_DECLARE_TEST(gpu_cg) {
+  CALL_SUBTEST(test_gpu_cg<double>(64));
+  CALL_SUBTEST(test_gpu_cg<double>(256));
+  CALL_SUBTEST(test_gpu_cg<float>(64));
+  CALL_SUBTEST(test_gpu_cg_jacobi<double>(64));
+  CALL_SUBTEST(test_gpu_cg_jacobi<double>(256));
+  CALL_SUBTEST(test_gpu_cg_jacobi<float>(64));
+}
--- a/test/gpu_cusparse_spmv.cpp
+++ b/test/gpu_cusparse_spmv.cpp
@@ -180,6 +180,105 @@ void test_empty() {
  VERIFY_IS_EQUAL(y.size(), 0);
 }

+// ---- DeviceMatrix SpMV (no host roundtrip) ----------------------------------
+
+template <typename Scalar>
+void test_spmv_device(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(n, n);
+  Vec x = Vec::Random(n);
+
+  // Use shared GpuContext for same-stream execution.
+  GpuContext gpu_ctx;
+  GpuSparseContext<Scalar> ctx(gpu_ctx);
+
+  auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
+  DeviceMatrix<Scalar> d_y;
+
+  ctx.multiply(A, d_x, d_y);
+
+  Vec y_gpu = d_y.toHost(gpu_ctx.stream());
+  Vec y_cpu = A * x;
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- Expression syntax: d_y = d_A * d_x ------------------------------------
+
+template <typename Scalar>
+void test_spmv_expr(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(n, n);
+  Vec x = Vec::Random(n);
+
+  GpuContext gpu_ctx;
+  GpuSparseContext<Scalar> ctx(gpu_ctx);
+
+  // Upload sparse matrix and create device view.
+  auto d_A = ctx.deviceView(A);
+
+  // Upload x.
+  auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
+
+  // Expression syntax: d_y = d_A * d_x
+  DeviceMatrix<Scalar> d_y;
+  d_y = d_A * d_x;
+
+  // Also test with noalias():
+  DeviceMatrix<Scalar> d_tmp;
+  d_tmp.noalias() = d_A * d_x;
+
+  Vec y_gpu = d_y.toHost(gpu_ctx.stream());
+  Vec tmp_gpu = d_tmp.toHost(gpu_ctx.stream());
+  Vec y_cpu = A * x;
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+  VERIFY((tmp_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- deviceView overwrite: second view replaces first -----------------------
+
+template <typename Scalar>
+void test_deviceview_overwrite(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A1 = make_sparse<Scalar>(n, n);
+  SpMat A2 = make_sparse<Scalar>(n, n);  // different random matrix
+
+  Vec x = Vec::Random(n);
+
+  GpuContext gpu_ctx;
+  GpuSparseContext<Scalar> ctx(gpu_ctx);
+
+  // First view: A1.
+  auto d_A1 = ctx.deviceView(A1);
+  auto d_x = DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
+  DeviceMatrix<Scalar> d_y1;
+  d_y1 = d_A1 * d_x;
+  Vec y1_gpu = d_y1.toHost(gpu_ctx.stream());
+  Vec y1_cpu = A1 * x;
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((y1_gpu - y1_cpu).norm() / (y1_cpu.norm() + RealScalar(1)) < tol);
+
+  // Second view overwrites first: now uses A2.
+  auto d_A2 = ctx.deviceView(A2);
+  DeviceMatrix<Scalar> d_y2;
+  d_y2 = d_A2 * d_x;
+  Vec y2_gpu = d_y2.toHost(gpu_ctx.stream());
+  Vec y2_cpu = A2 * x;
+  VERIFY((y2_gpu - y2_cpu).norm() / (y2_cpu.norm() + RealScalar(1)) < tol);
+}
+
 // ---- Per-scalar driver ------------------------------------------------------

 template <typename Scalar>
@@ -193,6 +292,9 @@ void test_scalar() {
  CALL_SUBTEST(test_identity<Scalar>(64));
  CALL_SUBTEST(test_reuse<Scalar>(64));
  CALL_SUBTEST(test_empty<Scalar>());
+  CALL_SUBTEST(test_spmv_device<Scalar>(64));
+  CALL_SUBTEST(test_spmv_expr<Scalar>(64));
+  CALL_SUBTEST(test_deviceview_overwrite<Scalar>(64));
 }

 EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {
--- a/test/gpu_device_matrix.cpp
+++ b/test/gpu_device_matrix.cpp
@@ -12,6 +12,7 @@

 #define EIGEN_USE_GPU
 #include "main.h"
+#include <Eigen/Sparse>
 #include <Eigen/GPU>

 using namespace Eigen;
@@ -230,6 +231,217 @@ void test_scalar() {
  CALL_SUBTEST(test_move_assign<Scalar>(64, 64));
 }

+// ---- BLAS-1: dot product ----------------------------------------------------
+
+template <typename Scalar>
+void test_blas1(Index n) {
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  // All BLAS-1 ops share one GpuContext — same stream, zero event overhead.
+  GpuContext ctx;
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+
+  // dot
+  {
+    Vec a = Vec::Random(n);
+    Vec b = Vec::Random(n);
+    auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
+    auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
+    Scalar gpu_dot = d_a.dot(ctx, d_b);
+    Scalar cpu_dot = a.dot(b);
+    VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
+  }
+
+  // norm / squaredNorm
+  {
+    Vec a = Vec::Random(n);
+    auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
+    RealScalar gpu_norm = d_a.norm(ctx);
+    RealScalar cpu_norm = a.norm();
+    VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol);
+    RealScalar gpu_sqnorm = d_a.squaredNorm(ctx);
+    RealScalar cpu_sqnorm = a.squaredNorm();
+    VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol);
+  }
+
+  // addScaled (axpy)
+  {
+    Vec x = Vec::Random(n);
+    Vec y = Vec::Random(n);
+    Scalar alpha(2.5);
+    Vec y_ref = y + alpha * x;
+    auto d_y = DeviceMatrix<Scalar>::fromHost(y, ctx.stream());
+    auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
+    d_y.addScaled(ctx, alpha, d_x);
+    Vec y_gpu = d_y.toHost(ctx.stream());
+    VERIFY((y_gpu - y_ref).norm() < tol * y_ref.norm() + tol);
+  }
+
+  // scale (scal)
+  {
+    Vec x = Vec::Random(n);
+    Scalar alpha(3.0);
+    Vec x_ref = alpha * x;
+    auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
+    d_x.scale(ctx, alpha);
+    Vec x_gpu = d_x.toHost(ctx.stream());
+    VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol);
+  }
+
+  // copyFrom
+  {
+    Vec x = Vec::Random(n);
+    auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
+    DeviceMatrix<Scalar> d_y;
+    d_y.copyFrom(ctx, d_x);
+    Vec y = d_y.toHost(ctx.stream());
+    VERIFY_IS_APPROX(y, x);
+  }
+
+  // setZero
+  {
+    Vec x = Vec::Random(n);
+    auto d_x = DeviceMatrix<Scalar>::fromHost(x, ctx.stream());
+    d_x.setZero(ctx);
+    Vec result = d_x.toHost(ctx.stream());
+    VERIFY_IS_EQUAL(result, Vec::Zero(n));
+  }
+}
+
+// ---- BLAS-1 operator overloads (CG-style) -----------------------------------
+
+template <typename Scalar>
+void test_cg_operators(Index n) {
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+
+  Vec x = Vec::Random(n);
+  Vec p = Vec::Random(n);
+  Vec tmp = Vec::Random(n);
+  Vec z = Vec::Random(n);
+  Scalar alpha(2.5);
+  Scalar beta(0.7);
+
+  // Test: x += alpha * p
+  {
+    Vec x_ref = x + alpha * p;
+    auto d_x = DeviceMatrix<Scalar>::fromHost(x);
+    auto d_p = DeviceMatrix<Scalar>::fromHost(p);
+    d_x += alpha * d_p;
+    Vec x_gpu = d_x.toHost();
+    VERIFY((x_gpu - x_ref).norm() < tol * x_ref.norm() + tol);
+  }
+
+  // Test: r -= alpha * tmp
+  {
+    Vec r = Vec::Random(n);
+    Vec r_ref = r - alpha * tmp;
+    auto d_r = DeviceMatrix<Scalar>::fromHost(r);
+    auto d_tmp = DeviceMatrix<Scalar>::fromHost(tmp);
+    d_r -= alpha * d_tmp;
+    Vec r_gpu = d_r.toHost();
+    VERIFY((r_gpu - r_ref).norm() < tol * r_ref.norm() + tol);
+  }
+
+  // Test: p = z + beta * p  (cuBLAS geam)
+  {
+    Vec p_copy = p;
+    Vec p_ref = z + beta * p_copy;
+    auto d_p = DeviceMatrix<Scalar>::fromHost(p_copy);
+    auto d_z = DeviceMatrix<Scalar>::fromHost(z);
+    d_p = d_z + beta * d_p;
+    Vec p_gpu = d_p.toHost();
+    VERIFY((p_gpu - p_ref).norm() < tol * p_ref.norm() + tol);
+  }
+
+  // Test: operator+= and operator-= with DeviceMatrix (no scalar)
+  {
+    Vec a = Vec::Random(n);
+    Vec b = Vec::Random(n);
+    Vec a_ref = a + b;
+    auto d_a = DeviceMatrix<Scalar>::fromHost(a);
+    auto d_b = DeviceMatrix<Scalar>::fromHost(b);
+    d_a += d_b;
+    VERIFY((d_a.toHost() - a_ref).norm() < tol * a_ref.norm() + tol);
+  }
+}
+
+// ---- DeviceScalar: deferred sync -------------------------------------------
+
+template <typename Scalar>
+void test_device_scalar() {
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  const Index n = 256;
+  Vec a = Vec::Random(n);
+  Vec b = Vec::Random(n);
+
+  GpuContext ctx;
+  auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
+  auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
+
+  // dot() returns DeviceScalar — implicit conversion to Scalar syncs.
+  Scalar gpu_dot = d_a.dot(ctx, d_b);
+  Scalar cpu_dot = a.dot(b);
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(numext::abs(gpu_dot - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
+
+  // squaredNorm() returns host RealScalar directly (syncs internally).
+  RealScalar gpu_sqnorm = d_a.squaredNorm(ctx);
+  RealScalar cpu_sqnorm = a.squaredNorm();
+  VERIFY(numext::abs(gpu_sqnorm - cpu_sqnorm) < tol * cpu_sqnorm + tol);
+
+  // norm() returns DeviceScalar<RealScalar> — implicit conversion syncs.
+  RealScalar gpu_norm = d_a.norm(ctx);
+  RealScalar cpu_norm = a.norm();
+  VERIFY(numext::abs(gpu_norm - cpu_norm) < tol * cpu_norm + tol);
+
+  // Convenience overloads (thread-local context).
+  GpuContext::setThreadLocal(&ctx);
+  Scalar gpu_dot2 = d_a.dot(d_b);
+  VERIFY(numext::abs(gpu_dot2 - cpu_dot) < tol * numext::abs(cpu_dot) + tol);
+  GpuContext::setThreadLocal(nullptr);
+
+  // Empty vectors: dot and norm must return zero.
+  {
+    DeviceMatrix<Scalar> d_empty(0, 1);
+    DeviceMatrix<Scalar> d_empty2(0, 1);
+    Scalar empty_dot = d_empty.dot(ctx, d_empty2);
+    VERIFY_IS_EQUAL(empty_dot, Scalar(0));
+    RealScalar empty_sqnorm = d_empty.squaredNorm(ctx);
+    VERIFY_IS_EQUAL(empty_sqnorm, RealScalar(0));
+    RealScalar empty_norm = d_empty.norm(ctx);
+    VERIFY_IS_EQUAL(empty_norm, RealScalar(0));
+  }
+}
+
+// ---- cwiseProduct -----------------------------------------------------------
+
+template <typename Scalar>
+void test_cwiseProduct() {
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  const Index n = 256;
+  Vec a = Vec::Random(n);
+  Vec b = Vec::Random(n);
+  Vec ref = a.array() * b.array();
+
+  GpuContext ctx;
+  auto d_a = DeviceMatrix<Scalar>::fromHost(a, ctx.stream());
+  auto d_b = DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
+  auto d_c = d_a.cwiseProduct(ctx, d_b);
+  Vec result = d_c.toHost(ctx.stream());
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((result - ref).norm() < tol * ref.norm() + tol);
+}
+
 EIGEN_DECLARE_TEST(gpu_device_matrix) {
  CALL_SUBTEST(test_default_construct());
  CALL_SUBTEST(test_empty());
@@ -242,4 +454,18 @@ EIGEN_DECLARE_TEST(gpu_device_matrix) {
  CALL_SUBTEST(test_scalar<double>());
  CALL_SUBTEST(test_scalar<std::complex<float>>());
  CALL_SUBTEST(test_scalar<std::complex<double>>());
+  CALL_SUBTEST(test_blas1<float>(256));
+  CALL_SUBTEST(test_blas1<double>(256));
+  CALL_SUBTEST(test_blas1<std::complex<float>>(256));
+  CALL_SUBTEST(test_blas1<std::complex<double>>(256));
+  CALL_SUBTEST(test_cg_operators<float>(256));
+  CALL_SUBTEST(test_cg_operators<double>(256));
+  CALL_SUBTEST(test_cg_operators<std::complex<float>>(256));
+  CALL_SUBTEST(test_cg_operators<std::complex<double>>(256));
+  CALL_SUBTEST(test_device_scalar<float>());
+  CALL_SUBTEST(test_device_scalar<double>());
+  CALL_SUBTEST(test_device_scalar<std::complex<float>>());
+  CALL_SUBTEST(test_device_scalar<std::complex<double>>());
+  CALL_SUBTEST(test_cwiseProduct<float>());
+  CALL_SUBTEST(test_cwiseProduct<double>());
 }