Files
eigen/test/gpu_cublas.cpp
Rasmus Munk Larsen 58c44ef36d GPU: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)
Add Eigen/GPU module: A standalone GPU library dispatch layer where
DeviceMatrix<Scalar> operations map 1:1 to cuBLAS/cuSOLVER calls.
CPU and GPU solvers coexist in the same binary with compatible syntax.

Core infrastructure:
- DeviceMatrix<Scalar>: RAII dense column-major GPU memory wrapper with
  async host transfer (fromHost/toHost) and CUDA event-based cross-stream
  synchronization.
- GpuContext: Unified execution context owning a CUDA stream + cuBLAS
  handle + cuSOLVER handle. Thread-local default with explicit override
  via setThreadLocal(). Stream-borrowing constructor for integration.
- DeviceBuffer: Typed RAII device allocation with move semantics.

cuBLAS dispatch (expression syntax):
- GEMM: d_C = d_A.adjoint() * d_B (cublasXgemm)
- TRSM: d_X = d_A.triangularView<Lower>().solve(d_B) (cublasXtrsm)
- SYMM/HEMM: d_C = d_A.selfadjointView<Lower>() * d_B (cublasXsymm)
- SYRK/HERK: d_C = d_A * d_A.adjoint() (cublasXsyrk)

cuSOLVER dispatch:
- GpuLLT: Cached Cholesky factorization (cusolverDnXpotrf + Xpotrs)
- GpuLU: Cached LU factorization (cusolverDnXgetrf + Xgetrs)
- Solver chaining: auto x = d_A.llt().solve(d_B)
- Solver expressions with .device(ctx) for explicit stream control.

CI: Bump CUDA container to Ubuntu 22.04 (CMake 3.22), GCC 10->11,
Clang 12->14. Bump cmake_minimum_required to 3.17 for FindCUDAToolkit.

Tests: gpu_cublas.cpp, gpu_cusolver_llt.cpp, gpu_cusolver_lu.cpp,
gpu_device_matrix.cpp, gpu_library_example.cu
Benchmarks: bench_gpu_solvers.cpp, bench_gpu_chaining.cpp,
bench_gpu_batching.cpp
2026-04-09 19:05:25 -07:00

729 lines
22 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Tests for cuBLAS GEMM dispatch via DeviceMatrix expression syntax.
// Covers: d_C = d_A * d_B, adjoint, transpose, scaled, +=, .device(ctx).
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/GPU>
using namespace Eigen;
// ---- Basic GEMM: C = A * B -------------------------------------------------
template <typename Scalar>
void test_gemm_basic(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
// Expression: d_C = d_A * d_B
DeviceMatrix<Scalar> d_C;
d_C = d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM with adjoint: C = A^H * B ----------------------------------------
template <typename Scalar>
void test_gemm_adjoint_lhs(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(k, m); // A is k×m, A^H is m×k
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = d_A.adjoint() * d_B;
Mat C = d_C.toHost();
Mat C_ref = A.adjoint() * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM with transpose: C = A * B^T --------------------------------------
template <typename Scalar>
void test_gemm_transpose_rhs(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(n, k); // B is n×k, B^T is k×n
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = d_A * d_B.transpose();
Mat C = d_C.toHost();
Mat C_ref = A * B.transpose();
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM with scaled: C = alpha * A * B ------------------------------------
template <typename Scalar>
void test_gemm_scaled(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
Scalar alpha = Scalar(2.5);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = alpha * d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = alpha * A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM accumulate: C += A * B (beta=1) -----------------------------------
template <typename Scalar>
void test_gemm_accumulate(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
Mat C_init = Mat::Random(m, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);
d_C += d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = C_init + A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM accumulate into empty destination ---------------------------------
template <typename Scalar>
void test_gemm_accumulate_empty(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C += d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM subtract: C -= A * B (beta=1, alpha=-1) --------------------------
template <typename Scalar>
void test_gemm_subtract(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
Mat C_init = Mat::Random(m, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_C = DeviceMatrix<Scalar>::fromHost(C_init);
GpuContext ctx;
d_C.device(ctx) -= d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = C_init - A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM subtract from empty destination -----------------------------------
template <typename Scalar>
void test_gemm_subtract_empty(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuContext ctx;
DeviceMatrix<Scalar> d_C;
d_C.device(ctx) -= d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = -(A * B);
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM with scaled RHS: C = A * (alpha * B) -----------------------------
template <typename Scalar>
void test_gemm_scaled_rhs(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
Scalar alpha = Scalar(3.0);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = d_A * (alpha * d_B);
Mat C = d_C.toHost();
Mat C_ref = A * (alpha * B);
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM dimension mismatch must assert ------------------------------------
template <typename Scalar>
void test_gemm_dimension_mismatch() {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
Mat A = Mat::Random(8, 5);
Mat B = Mat::Random(6, 7); // inner dimension mismatch
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
VERIFY_RAISES_ASSERT(d_C = d_A * d_B);
}
// ---- GEMM with explicit GpuContext ------------------------------------------
template <typename Scalar>
void test_gemm_explicit_context(Index m, Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(m, k);
Mat B = Mat::Random(k, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuContext ctx;
DeviceMatrix<Scalar> d_C;
d_C.device(ctx) = d_A * d_B;
Mat C = d_C.toHost();
Mat C_ref = A * B;
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM cross-context reuse of the same destination -----------------------
template <typename Scalar>
void test_gemm_cross_context_reuse(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, n);
Mat D = Mat::Random(n, n);
Mat E = Mat::Random(n, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
GpuContext ctx1;
GpuContext ctx2;
DeviceMatrix<Scalar> d_C;
d_C.device(ctx1) = d_A * d_B;
d_C.device(ctx2) += d_D * d_E;
Mat C = d_C.toHost();
Mat C_ref = A * B + D * E;
RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM cross-context resize of the destination ---------------------------
template <typename Scalar>
void test_gemm_cross_context_resize() {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(64, 64);
Mat B = Mat::Random(64, 64);
Mat D = Mat::Random(32, 16);
Mat E = Mat::Random(16, 8);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
GpuContext ctx1;
GpuContext ctx2;
DeviceMatrix<Scalar> d_C;
d_C.device(ctx1) = d_A * d_B;
d_C.device(ctx2) = d_D * d_E;
Mat C = d_C.toHost();
Mat C_ref = D * E;
RealScalar tol = RealScalar(16) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- GEMM chaining: C = (A * B) then D = C * E -----------------------------
template <typename Scalar>
void test_gemm_chain(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, n);
Mat E = Mat::Random(n, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
auto d_E = DeviceMatrix<Scalar>::fromHost(E);
DeviceMatrix<Scalar> d_C;
d_C = d_A * d_B;
DeviceMatrix<Scalar> d_D;
d_D = d_C * d_E;
Mat D = d_D.toHost();
Mat D_ref = (A * B) * E;
RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * D_ref.norm();
VERIFY((D - D_ref).norm() < tol);
}
// ---- Square identity check: A * I = A ---------------------------------------
template <typename Scalar>
void test_gemm_identity(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
Mat A = Mat::Random(n, n);
Mat eye = Mat::Identity(n, n);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_I = DeviceMatrix<Scalar>::fromHost(eye);
DeviceMatrix<Scalar> d_C;
d_C = d_A * d_I;
Mat C = d_C.toHost();
VERIFY_IS_APPROX(C, A);
}
// ---- LLT solve expression: d_X = d_A.llt().solve(d_B) ----------------------
template <typename MatrixType>
MatrixType make_spd(Index n) {
using Scalar = typename MatrixType::Scalar;
MatrixType M = MatrixType::Random(n, n);
return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
}
template <typename Scalar>
void test_llt_solve_expr(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = make_spd<Mat>(n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.llt().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- LLT solve with explicit context ----------------------------------------
template <typename Scalar>
void test_llt_solve_expr_context(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = make_spd<Mat>(n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuContext ctx;
DeviceMatrix<Scalar> d_X;
d_X.device(ctx) = d_A.llt().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------
template <typename Scalar>
void test_lu_solve_expr(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.lu().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------
template <typename Scalar>
void test_gemm_then_solve(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat D = Mat::Random(n, 1);
// Make SPD: C = A^H * A + n*I
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
DeviceMatrix<Scalar> d_C;
d_C = d_A.adjoint() * d_A;
// Add n*I on host (no element-wise ops on DeviceMatrix yet).
Mat C = d_C.toHost();
C += Mat::Identity(n, n) * static_cast<Scalar>(n);
d_C = DeviceMatrix<Scalar>::fromHost(C);
auto d_D = DeviceMatrix<Scalar>::fromHost(D);
DeviceMatrix<Scalar> d_X;
d_X = d_C.llt().solve(d_D);
Mat X = d_X.toHost();
RealScalar residual = (C * X - D).norm() / D.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- LLT solve with Upper triangle -----------------------------------------
template <typename Scalar>
void test_llt_solve_upper(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = make_spd<Mat>(n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.template llt<Upper>().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- LU solve with explicit context -----------------------------------------
template <typename Scalar>
void test_lu_solve_expr_context(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
GpuContext ctx;
DeviceMatrix<Scalar> d_X;
d_X.device(ctx) = d_A.lu().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- Zero-nrhs solver expressions ------------------------------------------
template <typename Scalar>
void test_llt_solve_zero_nrhs(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
Mat A = make_spd<Mat>(n);
Mat B = Mat::Random(n, 0);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.llt().solve(d_B);
VERIFY_IS_EQUAL(d_X.rows(), n);
VERIFY_IS_EQUAL(d_X.cols(), 0);
}
template <typename Scalar>
void test_lu_solve_zero_nrhs(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, 0);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.lu().solve(d_B);
VERIFY_IS_EQUAL(d_X.rows(), n);
VERIFY_IS_EQUAL(d_X.cols(), 0);
}
// ---- TRSM: triangularView<UpLo>().solve(B) ----------------------------------
template <typename Scalar, int UpLo>
void test_trsm(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
// Build a well-conditioned triangular matrix.
Mat A = Mat::Random(n, n);
A.diagonal().array() += static_cast<Scalar>(n); // ensure non-singular
if (UpLo == Lower)
A = A.template triangularView<Lower>();
else
A = A.template triangularView<Upper>();
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_X;
d_X = d_A.template triangularView<UpLo>().solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / B.norm();
VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- SYMM/HEMM: selfadjointView<UpLo>() * B --------------------------------
template <typename Scalar, int UpLo>
void test_symm(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = make_spd<Mat>(n); // SPD is also self-adjoint
Mat B = Mat::Random(n, nrhs);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
auto d_B = DeviceMatrix<Scalar>::fromHost(B);
DeviceMatrix<Scalar> d_C;
d_C = d_A.template selfadjointView<UpLo>() * d_B;
Mat C = d_C.toHost();
Mat C_ref = A * B; // A is symmetric, so full multiply == symm
RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C - C_ref).norm() < tol);
}
// ---- SYRK/HERK: rankUpdate(A) → C = A * A^H --------------------------------
template <typename Scalar>
void test_syrk(Index n, Index k) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, k);
auto d_A = DeviceMatrix<Scalar>::fromHost(A);
DeviceMatrix<Scalar> d_C;
d_C.template selfadjointView<Lower>().rankUpdate(d_A);
Mat C = d_C.toHost();
// Only lower triangle is meaningful for SYRK. Compare lower triangle.
Mat C_ref = A * A.adjoint();
// Extract lower triangle for comparison.
Mat C_lower = C.template triangularView<Lower>();
Mat C_ref_lower = C_ref.template triangularView<Lower>();
RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
VERIFY((C_lower - C_ref_lower).norm() < tol);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_gemm_basic<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_basic<Scalar>(128, 64, 32));
CALL_SUBTEST(test_gemm_basic<Scalar>(1, 1, 1));
CALL_SUBTEST(test_gemm_basic<Scalar>(256, 256, 256));
CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(128, 32, 64));
CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(128, 32, 64));
CALL_SUBTEST(test_gemm_scaled<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_scaled_rhs<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_accumulate<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_accumulate_empty<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_subtract<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_subtract_empty<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_dimension_mismatch<Scalar>());
CALL_SUBTEST(test_gemm_explicit_context<Scalar>(64, 64, 64));
CALL_SUBTEST(test_gemm_cross_context_reuse<Scalar>(64));
CALL_SUBTEST(test_gemm_cross_context_resize<Scalar>());
CALL_SUBTEST(test_gemm_chain<Scalar>(64));
CALL_SUBTEST(test_gemm_identity<Scalar>(64));
// Solver expressions — zero-size edge cases (use dedicated tests, not residual-based)
// Solver expressions
CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 1));
CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 4));
CALL_SUBTEST(test_llt_solve_expr<Scalar>(256, 8));
CALL_SUBTEST(test_llt_solve_expr_context<Scalar>(64, 4));
CALL_SUBTEST(test_llt_solve_upper<Scalar>(64, 4));
CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 1));
CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 4));
CALL_SUBTEST(test_lu_solve_expr<Scalar>(256, 8));
CALL_SUBTEST(test_lu_solve_expr_context<Scalar>(64, 4));
CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(64));
CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(0));
CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(64));
CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(0));
CALL_SUBTEST(test_gemm_then_solve<Scalar>(64));
// TRSM
CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 1)));
CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 4)));
CALL_SUBTEST((test_trsm<Scalar, Upper>(64, 4)));
CALL_SUBTEST((test_trsm<Scalar, Lower>(256, 8)));
// SYMM/HEMM
CALL_SUBTEST((test_symm<Scalar, Lower>(64, 4)));
CALL_SUBTEST((test_symm<Scalar, Upper>(64, 4)));
CALL_SUBTEST((test_symm<Scalar, Lower>(128, 8)));
// SYRK/HERK
CALL_SUBTEST(test_syrk<Scalar>(64, 64));
CALL_SUBTEST(test_syrk<Scalar>(64, 32));
CALL_SUBTEST(test_syrk<Scalar>(128, 64));
}
// ---- Solver failure mode tests (not templated on Scalar) --------------------
void test_llt_not_spd() {
// Negative definite matrix — LLT factorization must fail.
MatrixXd A = -MatrixXd::Identity(8, 8);
MatrixXd B = MatrixXd::Random(8, 1);
auto d_A = DeviceMatrix<double>::fromHost(A);
auto d_B = DeviceMatrix<double>::fromHost(B);
DeviceMatrix<double> d_X;
VERIFY_RAISES_ASSERT(d_X = d_A.llt().solve(d_B));
}
void test_lu_singular() {
// Zero matrix — LU factorization must detect singularity.
MatrixXd A = MatrixXd::Zero(8, 8);
MatrixXd B = MatrixXd::Random(8, 1);
auto d_A = DeviceMatrix<double>::fromHost(A);
auto d_B = DeviceMatrix<double>::fromHost(B);
DeviceMatrix<double> d_X;
VERIFY_RAISES_ASSERT(d_X = d_A.lu().solve(d_B));
}
EIGEN_DECLARE_TEST(gpu_cublas) {
CALL_SUBTEST(test_scalar<float>());
CALL_SUBTEST(test_scalar<double>());
CALL_SUBTEST(test_scalar<std::complex<float>>());
CALL_SUBTEST(test_scalar<std::complex<double>>());
CALL_SUBTEST(test_llt_not_spd());
CALL_SUBTEST(test_lu_singular());
}